diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3dbd6e3..ee1522e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,52 +6,95 @@ on: tags: - "v*" # Semantic version tags pull_request: +permissions: + contents: read jobs: test: strategy: matrix: - py: ["3.9", "3.10", "3.11", "3.12", "3.13"] - runs-on: ubuntu-latest + py: ["3.11", "3.12", "3.13", "3.14"] + os: [ubuntu-latest, macos-latest] + exclude: + # Only test the latest stable Python on macOS to keep CI fast + - os: macos-latest + py: "3.11" + - os: macos-latest + py: "3.12" + - os: macos-latest + py: "3.14" + runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 # full history fetch-tags: true # redundant in v4, but explicit - submodules: "recursive" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.py }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install submodules/daggerml_cli/ - pip install .[dev] + - name: Install uv + run: pip install uv==0.11.7 + - name: Sync dependencies + run: uv sync --group dev - name: pytest - run: python -m pytest -m "not slow" . + run: uv run pytest . 
+ sanitize: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + fetch-tags: true + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.13" + - name: Install uv + run: pip install uv==0.11.7 + - name: Build with ASAN + UBSAN + env: + CMAKE_ARGS: "-DDML_ENABLE_ASAN=ON -DDML_ENABLE_UBSAN=ON" + run: uv sync --group dev + - name: Run tests with sanitizers + env: + ASAN_OPTIONS: "detect_leaks=0:abort_on_error=1" + UBSAN_OPTIONS: "print_stacktrace=1:halt_on_error=1" + run: | + LIBASAN=$(gcc -print-file-name=libasan.so) + LD_PRELOAD="$LIBASAN" uv run pytest tests/contracts tests/contrib src/daggerml lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: "3.13" - - name: Install ruff - run: pip install ruff + - name: Install uv + run: pip install uv==0.11.7 + - name: Sync dependencies + run: uv sync --group dev - name: Run Ruff - run: ruff check --output-format=github . + run: uv run ruff check --output-format=github . publish: - needs: ["test", "lint"] + needs: ["test", "lint", "sanitize"] if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') name: Upload release to PyPI runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: "3.13" + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-3.13-publish-${{ hashFiles('**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-pip-3.13- + ${{ runner.os }}-pip- - name: Install hatch run: | pip install hatch diff --git a/.gitignore b/.gitignore index 8ac7c9c..35697c1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # daggerml .dml/ +.opencode/ # scm writes version info now... 
__about__.py @@ -86,6 +87,8 @@ instance/ # Sphinx documentation docs/_build/ docs/_autosummary/ +docs/contrib/plans/ +docs/contrib/*-plan.local.md # PyBuilder .pybuilder/ @@ -174,3 +177,4 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +uv.lock diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index c98b6ac..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "submodules/daggerml_cli"] - path = submodules/daggerml_cli - url = https://github.com/daggerml/daggerml-cli.git diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..e2ea359 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,8 @@ +# Agent Instructions + +- Before editing code, consult relevant docs via `DOC_MAP.md`. +- In your final summary/PR notes, list the docs you consulted. +- For contributor workflow rules and test layout, see `CONTRIBUTING.md`. +- For OpenSpec planning and spec-governance material, see `openspec/README.md`. +- When a function runs through the script executor (`@api.funkify(uri="script", ...)`), only the function source and explicitly injected `extra_objs`/`extra_lines` are available in the worker; module-level imports/globals are not. +- Keep script-executed functions self-contained: import dependencies inside the function body or inject them explicitly, or runtime `NameError` failures can appear at the call site. 
diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..2cfcbeb --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,116 @@ +cmake_minimum_required(VERSION 3.18) +project(daggerml LANGUAGES C) + +option(DML_ENABLE_ASAN "Enable AddressSanitizer" OFF) +option(DML_ENABLE_UBSAN "Enable UndefinedBehaviorSanitizer" OFF) + +set(_dml_sanitize_flags "") +if(DML_ENABLE_ASAN) + list(APPEND _dml_sanitize_flags "-fsanitize=address") +endif() +if(DML_ENABLE_UBSAN) + list(APPEND _dml_sanitize_flags "-fsanitize=undefined") +endif() +if(_dml_sanitize_flags) + list(APPEND _dml_sanitize_flags "-fno-omit-frame-pointer") +endif() + +find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) + +add_library(lmdb STATIC + c/third_party/lmdb/libraries/liblmdb/mdb.c + c/third_party/lmdb/libraries/liblmdb/midl.c +) +target_include_directories(lmdb PUBLIC c/third_party/lmdb/libraries/liblmdb) +set_property(TARGET lmdb PROPERTY POSITION_INDEPENDENT_CODE ON) +if(_dml_sanitize_flags) + target_compile_options(lmdb PRIVATE ${_dml_sanitize_flags}) + target_link_options(lmdb PRIVATE ${_dml_sanitize_flags}) +endif() + +add_library(msgpackc STATIC + c/third_party/msgpack/src/objectc.c + c/third_party/msgpack/src/unpack.c + c/third_party/msgpack/src/vrefbuffer.c + c/third_party/msgpack/src/zone.c + c/third_party/msgpack/src/version.c +) +target_include_directories(msgpackc PUBLIC c/third_party/msgpack/include) +set_property(TARGET msgpackc PROPERTY POSITION_INDEPENDENT_CODE ON) +if(_dml_sanitize_flags) + target_compile_options(msgpackc PRIVATE ${_dml_sanitize_flags}) + target_link_options(msgpackc PRIVATE ${_dml_sanitize_flags}) +endif() + +add_library(sha256 STATIC + c/third_party/sha256/sha256.c +) +target_include_directories(sha256 PUBLIC c/third_party/sha256) +set_property(TARGET sha256 PROPERTY POSITION_INDEPENDENT_CODE ON) +if(_dml_sanitize_flags) + target_compile_options(sha256 PRIVATE ${_dml_sanitize_flags}) + target_link_options(sha256 PRIVATE 
${_dml_sanitize_flags}) +endif() + +add_library(daggerml_core STATIC + c/src/dml_db.c + c/src/dml_hash.c + c/src/dml_msgpack.c + c/src/dml_value.c +) +target_include_directories(daggerml_core PUBLIC c/include) +target_include_directories(daggerml_core PUBLIC c/third_party/msgpack/include) +set_property(TARGET daggerml_core PROPERTY POSITION_INDEPENDENT_CODE ON) +if(_dml_sanitize_flags) + target_compile_options(daggerml_core PRIVATE ${_dml_sanitize_flags}) + target_link_options(daggerml_core PRIVATE ${_dml_sanitize_flags}) +endif() + +set(_db_pyx ${CMAKE_CURRENT_SOURCE_DIR}/src/daggerml/_internal/_db.pyx) +set(_db_c ${CMAKE_CURRENT_BINARY_DIR}/_db.c) + +add_custom_command( + OUTPUT ${_db_c} + COMMAND ${Python_EXECUTABLE} -m cython -3 --output-file ${_db_c} ${_db_pyx} + DEPENDS ${_db_pyx} + COMMENT "Cythonizing _db.pyx" + VERBATIM +) + +add_library(_db MODULE + ${_db_c} +) +set(DB_EXPORTS "${CMAKE_CURRENT_BINARY_DIR}/db.exports") +if(APPLE) + file(WRITE "${DB_EXPORTS}" "_PyInit__db\n") + target_link_options(_db PRIVATE + "-Wl,-exported_symbols_list,${DB_EXPORTS}" + ) +else() + file(WRITE "${DB_EXPORTS}" "{global: PyInit__db; local: *;};") # ELF symbols have no leading underscore (Mach-O only); export just the module init function and hide the rest + target_link_options(_db PRIVATE + "-Wl,--version-script=${DB_EXPORTS}" + ) +endif() +target_include_directories(_db PRIVATE c/third_party/lmdb/libraries/liblmdb) +target_include_directories(_db PRIVATE c/third_party/msgpack/include) +target_include_directories(_db PRIVATE c/include) +target_link_libraries(_db PRIVATE daggerml_core lmdb msgpackc sha256 Python::Module) +if(_dml_sanitize_flags) + target_compile_options(_db PRIVATE ${_dml_sanitize_flags}) + target_link_options(_db PRIVATE ${_dml_sanitize_flags}) + if(DML_ENABLE_ASAN AND NOT APPLE AND CMAKE_C_COMPILER_ID MATCHES "Clang") + target_link_options(_db PRIVATE "-shared-libasan") + endif() + if(DML_ENABLE_UBSAN AND CMAKE_C_COMPILER_ID MATCHES "Clang") + target_compile_options(_db PRIVATE "-fno-sanitize=function") + endif() +endif() + +set_target_properties(_db PROPERTIES PREFIX "" OUTPUT_NAME
"_db") + +install(TARGETS _db + LIBRARY DESTINATION daggerml/_internal/ + RUNTIME DESTINATION daggerml/_internal/ + ARCHIVE DESTINATION daggerml/_internal/ +) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f40e9c4..5b504b4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,6 +3,12 @@ Thank you for your interest in contributing! We welcome contributions via pull requests and appreciate your help in improving this project. +## Contributor Workflow References + +- `AGENTS.md`: agent-specific working notes and script-executor caveats. +- `DOC_MAP.md`: which project docs to read before editing a given code path. +- `openspec/README.md`: change-planning artifacts and current OpenSpec spec surfaces. + ## Reporting Issues - Search [existing issues](https://github.com/daggerml/python-lib/issues) before submitting a new one. @@ -18,7 +24,7 @@ requests and appreciate your help in improving this project. 1. Create a new branch for your feature or bugfix (with the github issue in the name). 2. Clone the repository and set it up: ```bash - git clone --recurse-submodules https://github.com/daggerml/python-lib.git + git clone https://github.com/daggerml/python-lib.git ``` 3. Make your changes in the new branch. 4. Write or update tests as needed. @@ -37,30 +43,84 @@ requests and appreciate your help in improving this project. - Add or update unit tests for any new features or bug fixes. - Use [pytest](https://pytest.org/) for running tests. -- The testing requirements are included in the `test` feature for the library. - - You can run tests using [hatch](https://hatch.pypa.io/): - ``` - hatch run pytest . - ``` - - If you're using vscode, you can create a venv with the `test` feature and run tests with the command palette: - ``` - Python: Run Tests - ``` - - Or install the `test` feature with pip and run tests: - ``` - pip install -e [test] - pytest . 
- ``` +- Standard local dev command pattern is: + ```bash + uv run --dev <command> + ``` +- When a command needs optional dependencies, include all extras: + ```bash + uv run --dev --all-extras <command> + ``` +- Run tests with: + ```bash + uv run --dev --all-extras pytest . + ``` +- Run lint with: + ```bash + uv run --dev --all-extras ruff check --fix . + ``` - We mark tests with `@pytest.mark.slow` for those that take longer to run. You can run only the fast tests with: ``` - pytest -m "not slow" . + uv run --dev --all-extras pytest -m "not slow" . ``` +- CI continues to run the full suite (`uv run pytest .`) to preserve complete coverage while local quick loops use `-m "not slow"`. - We mark tests that require `daggerml-cli` to be installed with `@pytest.mark.needs_dml`. You can exclude those tests with: ``` - pytest -m "not needs_dml" . + uv run --dev --all-extras pytest -m "not needs_dml" . ``` - Run all tests locally before submitting a pull request: - Ensure your code passes all tests and does not decrease code coverage. - If your changes introduce new dependencies, please update `pyproject.toml`, but we prefer to keep the dependencies to a minimum. +### Test taxonomy and naming + +This section is for contributors maintaining or restructuring the test suite. + +#### Directory layout + +- `tests/contracts/`: fast, isolated tests that verify one documented requirement or invariant. +- `tests/integration/`: multi-component or infrastructure-dependent tests. +- Existing folders such as `tests/_internal/` and `tests/contrib/` may remain during migration, but new or refactored suites should target `tests/contracts/` or `tests/integration/`. + +#### File naming + +- Contract tests should use `test_<area>_<contract>.py`. +- Integration tests should use `test_<area>_<flow>_integration.py`. +- Avoid generic names such as `test_core.py` when a more specific contract surface is known. + +#### Function naming and contract IDs + +- Prefer `test_<contract_id>__<behavior>()` where practical.
+- Example: `test_exec_lc_003__resume_uses_launch_state()`. +- Specify canonical contract IDs directly as literal strings. +- Use uppercase category prefixes and numeric suffixes such as `ADP-OUT-001`, `EXEC-LC-003`, and `EST-LOCK-004`. +- For parameterized cases, include the canonical ID in `id=`, for example `id="EXEC-LC-003:resume-uses-launch-state"`. + +#### Lifecycle parameterization + +- Tests that exercise a lifecycle should prefer one parameterized test per contract family over multiple near-duplicate tests. +- Make lifecycle stages explicit in case IDs, for example `kickoff`, `resume`, `terminal-succeeded`, and `terminal-failed`. + +#### Marker policy + +- Integration tests that require external processes, polling loops, remote roundtrips, or significant runtime orchestration must be marked `@pytest.mark.slow`. +- Contract tests in `tests/contracts/` should stay unmarked and fast by default. + +#### Migration policy + +- Migration is full replacement, not indefinite dual maintenance. +- When a legacy test is superseded by a new contract-structured test, remove the legacy test in the same change set or immediately after parity is confirmed. +- During migration, preserve traceability by carrying canonical contract IDs into new parameterized case IDs. +- The end state is for all maintained tests to align to this taxonomy. + +## Migration Rollout Policy + +When migrating storage or execution paths, use phased rollouts with tests at each phase: + +1. Implement the new destination path first and test it. +2. Write to both old and new paths and test. +3. Read from the new path and test. +4. Stop writing to the old path and test. +5. Remove the old path and test. + Thank you for helping make this project better! diff --git a/DOC_MAP.md b/DOC_MAP.md new file mode 100644 index 0000000..0043def --- /dev/null +++ b/DOC_MAP.md @@ -0,0 +1,178 @@ +# Edit Doc Map + +Audience: coding agents and maintainers working on this repository. 
+ +Use this file to identify which project docs to read before editing a code path, then list the docs you consulted in your summary or PR notes. + +## Global Docs (Always Read) + +- `docs/README.md`: docs index and layout conventions. +- `docs/architecture/system-overview.md`: system-level layering and subsystem boundaries. + +## Path Rules + +### Public Python API + +- Match: `src/daggerml/api.py`, `src/daggerml/__init__.py` +- Read: + - `docs/reference/python-api.md` + - `docs/concepts/dags-and-nodes.md` + - `docs/concepts/refs-and-namespaces.md` + - `docs/reference/errors.md` + - `docs/concepts/codecs-and-values.md` + - `docs/architecture/type-system.md` + +### Codec module + +- Match: `src/daggerml/codecs.py` +- Read: + - `docs/concepts/codecs-and-values.md` + - `docs/reference/python-api.md` + - `docs/contrib/reference/s3-and-codecs.md` + - `docs/reference/errors.md` + +### CLI surface + +- Match: `src/daggerml/_cli/**` +- Read: + - `docs/reference/cli.md` + - `docs/architecture/internal-modules.md` +- Also read: + - `docs/reference/errors.md` + +### Internal operations + +- Match: `src/daggerml/_internal/ops/**` +- Read: + - `docs/architecture/ops-layer.md` + - `docs/architecture/internal-modules.md` + - `docs/concepts/codecs-and-values.md` (when touching codec serialization/import behavior) +- Also read: + - the matching source module under `src/daggerml/_internal/ops/` for the file being changed + - example: editing `src/daggerml/_internal/ops/commit.py` -> read `docs/concepts/commits-and-history.md` and `docs/architecture/ops-layer.md` + +### Internal codec registry + +- Match: `src/daggerml/_internal/codec.py` +- Read: + - `docs/concepts/codecs-and-values.md` + - `docs/architecture/ops-layer.md` + +### Internal types and contracts + +- Match: `src/daggerml/_internal/types.py`, `src/daggerml/_internal/builtins.py` +- Read: + - `docs/concepts/refs-and-namespaces.md` + - `docs/architecture/type-system.md` + - `docs/reference/errors.md` + +### Internal 
storage / DB integration + +- Match: `src/daggerml/_internal/_db.pyx`, `src/daggerml/_internal/util.py` +- Read: + - `docs/concepts/storage.md` + - `docs/architecture/storage-internals.md` + - `docs/guides/store-and-load-external-data.md` + +### Runtime / execution flow + +- Match: `src/daggerml/_config.py`, `src/daggerml/util.py`, execution-related internals +- Read: + - `docs/reference/configuration.md` + - `docs/concepts/execution.md` + - `docs/architecture/remote-protocol.md` + - `docs/reference/errors.md` + +### C implementation and headers + +- Match: `c/src/**`, `c/include/**` +- Read: + - `c/README.md` + - `docs/concepts/storage.md` + - `docs/concepts/dags-and-nodes.md` +- Also read: + - `docs/architecture/storage-internals.md` when touching DB/reference behavior + +### Remote and sync behavior + +- Match: files related to remote/sync (for example `*_remote*`, remote ops/CLI) +- Read: + - `docs/architecture/remote-protocol.md` + - `docs/concepts/remotes.md` + - `docs/architecture/ops-layer.md` + +### Commit / DAG / head / index behavior + +- Match: files related to commit, dag, head, index (ops or CLI) +- Read: + - `docs/concepts/commits-and-history.md` + - `docs/concepts/dags-and-nodes.md` + - `docs/architecture/ops-layer.md` + +### Contrib modules + +- Match: `src/daggerml/contrib/**` +- Read: + - `docs/contrib/README.md` + - `docs/contrib/concepts/runtime.md` + - matching contrib doc(s): + - `docs/contrib/reference/python-api.md` + - `docs/contrib/reference/s3-and-codecs.md` + - `docs/contrib/reference/runtime-surfaces.md` + - `docs/contrib/architecture/execution-flow.md` + - `docs/contrib/architecture/supervisor-and-state.md` + +### Tests + +- Match: `tests/**` +- Read: + - docs corresponding to the code under test (same topic rules below) +- Also read: + - `docs/architecture/internal-modules.md` for internal test areas + - `CONTRIBUTING.md` for test layout and marker policy + +### Packaging / build / project config + +- Match: `pyproject.toml`, 
`uv.lock`, `CMakeLists.txt` +- Read: + - `README.md` + - `CONTRIBUTING.md` + - `c/README.md` (for C build changes) + +### Documentation edits + +- Match: `docs/**` +- Read: + - `docs/README.md` + +## Topic Rules (Apply In Addition To Path Rules) + +- If changing adapter behavior: + - `docs/concepts/execution.md` + - `docs/architecture/remote-protocol.md` + - `docs/contrib/concepts/runtime.md` + - `docs/contrib/reference/runtime-surfaces.md` + - `docs/contrib/architecture/supervisor-and-state.md` +- If changing data/object representation: + - `docs/concepts/dags-and-nodes.md` + - `docs/concepts/refs-and-namespaces.md` + - `docs/architecture/type-system.md` +- If changing codec behavior or literal write normalization: + - `docs/concepts/codecs-and-values.md` +- If changing storage, references, GC, or artifacts: + - `docs/concepts/storage.md` + - `docs/architecture/storage-internals.md` + - `docs/guides/store-and-load-external-data.md` +- If changing user-facing errors: + - `docs/reference/errors.md` + +## Ambiguity Rule + +If no rule clearly matches: + +- Read `docs/architecture/system-overview.md` and `docs/README.md`. +- Add or refine a mapping in this file in the same change. + +## Maintenance + +When adding a new top-level code area or major module, add or update a mapping here in the same PR. diff --git a/c/README.md b/c/README.md new file mode 100644 index 0000000..958cbc3 --- /dev/null +++ b/c/README.md @@ -0,0 +1,141 @@ +# C Core + +This document describes the current C implementation under `/c`. + +## File layout + +- `c/include/dml_db.h`: LMDB-backed database API. +- `c/include/dml_hash.h`: SHA-256 helper API. +- `c/include/dml_msgpack.h`: MessagePack encode/decode API. +- `c/include/dml_value.h`: `DmlValue` data model and helpers. +- `c/src/dml_db.c`: DB implementation. +- `c/src/dml_hash.c`: SHA-256 implementation wrapper. +- `c/src/dml_msgpack.c`: MessagePack implementation. +- `c/src/dml_value.c`: `DmlValue` implementation. 
+- `c/third_party`: vendored LMDB, msgpack-c, and SHA-256 sources. + +## Notes + +- Ref strings use `namespace:id` format (colon separator). +- This tree does not currently include `c/bindings/cpython/py_module.c`. +- The primary DB source file is `c/src/dml_db.c` (not `dml_core.c`). + +## `c/include/dml_db.h` + +### Types and constants + +- `DML_DB_ITER_LIMIT` +- `DmlDbHandle` +- `DmlDbTxn` +- `DmlObjCollection` +- DB error codes `DML_DB_OK` through `DML_DB_ERR_ENV_REOPENED` + +### Public API + +- `int dml_db_open(const char *path, const char *const *namespaces, size_t namespace_count, const int create_if_missing, size_t map_size, DmlDbHandle **out_handle)` +- `int dml_db_close(DmlDbHandle **p_handle)` +- `int dml_db_mapsize(DmlDbHandle **p_handle, size_t *out_mapsize)` +- `int dml_db_resize(DmlDbHandle **p_handle, size_t mapsize)` +- `int dml_db_txn_begin(DmlDbHandle **p_handle, const int readonly, DmlDbTxn **out_txn)` +- `int dml_db_txn_fin(DmlDbHandle **p_handle, DmlDbTxn *txn, const int commit)` +- `int dml_db_put(DmlDbHandle **p_handle, DmlDbTxn *txn, const char *ns, size_t ns_len, const char *key, size_t key_len, const DmlValue *value, int no_overwrite, int raw, DmlValue **out_ref)` +- `int dml_db_get(DmlDbHandle **p_handle, DmlDbTxn *txn, const char *ns, size_t ns_len, const char *key, size_t key_len, int raw, DmlValue **out_value)` +- `int dml_db_del(DmlDbHandle **p_handle, DmlDbTxn *txn, const char *ns, size_t ns_len, const char *key, size_t key_len)` +- `int dml_db_exists(DmlDbHandle **p_handle, DmlDbTxn *txn, const char *ns, size_t ns_len, const char *key, size_t key_len, int *out_exists)` +- `int dml_db_iter_keys(struct DmlDbHandle **p_handle, DmlDbTxn *txn, const char *ns, const char *start_token, DmlObjCollection *out_page)` +- `void dml_db_free_obj_collection(DmlObjCollection *page)` +- `int dml_db_list_orphans(struct DmlDbHandle **p_handle, DmlDbTxn *txn, const char *const *start_refs, size_t start_refs_count, DmlValue **out_refs)` + +## 
`c/src/dml_db.c` + +### Internal structs + +- `struct DmlDbHandle` +- `struct DmlDbTxn` +- `DmlDumpEntry` +- `DmlDumpList` + +### Internal helpers + +- `static int dml_map_lmdb_rc(int rc)` +- `static void dml_dump_list_free(DmlDumpList *list)` +- `static int dml_dump_list_find(const DmlDumpList *list, const char *key, size_t key_len)` +- `static int dml_dump_list_add(DmlDumpList *list, const char *key, size_t key_len, DmlValue *value)` +- `static int dml_dump_add_ref(DmlDbHandle **p_handle, DmlDbTxn *txn, DmlDumpList *list, const char *key, size_t key_len)` +- `static int dml_dump_visit_value(DmlDbHandle **p_handle, DmlDbTxn *txn, DmlDumpList *list, const DmlValue *value)` +- `static int dml_db_reopen_handle(struct DmlDbHandle **p_handle)` +- `static int dml_db_validate(struct DmlDbHandle **p_handle, const int reopen)` +- `static int dml_db_validate_txn(struct DmlDbHandle **p_handle, struct DmlDbTxn *txn, const int reopen)` +- `int dml_ns_dbi_lookup(DmlDbHandle **p_handle, DmlDbTxn *txn, const char *ns, size_t ns_len, MDB_dbi *out_dbi)` + +### Exported functions + +- `int dml_db_open(...)` +- `int dml_db_close(...)` +- `int dml_db_mapsize(...)` +- `int dml_db_resize(...)` +- `int dml_db_txn_begin(...)` +- `int dml_db_txn_fin(...)` +- `int dml_db_put(...)` +- `int dml_db_get(...)` +- `int dml_db_del(...)` +- `int dml_db_exists(...)` +- `int dml_db_iter_keys(...)` +- `int dml_db_list_orphans(...)` +- `void dml_db_free_obj_collection(...)` + +## `c/include/dml_hash.h` and `c/src/dml_hash.c` + +- `int dml_hash_sha256_hex(const void *data, size_t len, char out[65])` + +## `c/include/dml_msgpack.h` and `c/src/dml_msgpack.c` + +### Types and constants + +- `DmlMsgpackBuffer` +- `DML_MSGPACK_OK` +- `DML_MSGPACK_ERR_INVALID` +- `DML_MSGPACK_ERR_NOMEM` +- `DML_MSGPACK_EXT_REF` + +### API + +- `int dml_msgpack_pack(const DmlValue *value, DmlMsgpackBuffer *out_buffer)` +- `int dml_msgpack_unpack(const char *data, size_t size, DmlValue **out_value)` +- `void 
dml_msgpack_free_buffer(void *data)` + +### Internal helpers + +- `static int dml_msgpack_entry_compare(const void *left, const void *right)` +- `static int dml_msgpack_pack_value(msgpack_packer *packer, const DmlValue *value)` +- `static DmlValue *dml_msgpack_from_object(const msgpack_object *obj)` + +## `c/include/dml_value.h` and `c/src/dml_value.c` + +### Types and constants + +- `DmlValueType` (`DML_VALUE_NULL`, `DML_VALUE_BOOL`, `DML_VALUE_INT`, `DML_VALUE_FLOAT`, `DML_VALUE_STR`, `DML_VALUE_LIST`, `DML_VALUE_MAP`, `DML_VALUE_REF`) +- `DmlValue` +- `DmlMapEntry` +- `DML_REF_ID_MAX` + +### API + +- `DmlValue *dml_value_new_null(void)` +- `DmlValue *dml_value_new_bool(int value)` +- `DmlValue *dml_value_new_int(long long value)` +- `DmlValue *dml_value_new_float(double value)` +- `DmlValue *dml_value_new_str(const char *data, size_t size)` +- `DmlValue *dml_value_new_ref(const char *data, size_t size)` +- `DmlValue *dml_value_new_list(size_t count)` +- `int dml_value_list_set(DmlValue *list, size_t index, DmlValue *item)` +- `DmlValue *dml_value_new_map(size_t count)` +- `int dml_value_map_set(DmlValue *map, size_t index, const char *key, size_t key_len, DmlValue *value)` +- `int dml_value_map_sort(DmlValue *map)` +- `void dml_value_free(DmlValue *value)` +- `int dml_ref_split(const char *ref, size_t ref_len, const char **namespace_str, size_t *namespace_len, const char **id_str, size_t *id_len)` + +### Internal helpers + +- `static int dml_value_map_entry_compare(const void *left, const void *right)` +- `static DmlValue *dml_value_alloc(DmlValueType type)` diff --git a/c/include/dml_db.h b/c/include/dml_db.h new file mode 100644 index 0000000..9107591 --- /dev/null +++ b/c/include/dml_db.h @@ -0,0 +1,118 @@ +#ifndef DAGGERML_DML_DB_H +#define DAGGERML_DML_DB_H + +#include <stddef.h> + +#include "dml_value.h" + +#define DML_DB_ITER_LIMIT 64 + +typedef struct DmlDbHandle DmlDbHandle; +typedef struct DmlDbTxn DmlDbTxn; +typedef struct DmlObjCollection { + char *keys;
// at most DML_DB_ITER_LIMIT keys concatenated together with null terminators + size_t *key_lens; // per-key lengths to support binary keys containing NUL bytes + DmlValue **values; // the values corresponding to the keys in the same order. + size_t count; + char *next_token; +} DmlObjCollection; + +enum { + DML_DB_OK = 0, + DML_DB_ERR_HANDLE_INVALID = -1, + DML_DB_ERR_HANDLE_CLOSED = -2, + DML_DB_ERR_HANDLE_FORKED = -3, + DML_DB_ERR_TXN_INVALID = -4, + DML_DB_ERR_TXN_READONLY = -5, + DML_DB_ERR_TXN_FORKED = -6, + DML_DB_ERR_INPUT_INVALID = -7, + DML_DB_ERR_TYPE_INVALID = -8, + DML_DB_ERR_PATH_INVALID = -9, + DML_DB_ERR_REF_INVALID = -10, + DML_DB_ERR_NAMESPACE_INVALID = -11, + DML_DB_ERR_NOT_FOUND = -12, + DML_DB_ERR_KEY_EXISTS = -13, + DML_DB_ERR_MSGPACK = -14, + DML_DB_ERR_NOMEM = -15, + DML_DB_ERR_MAP_FULL = -16, + DML_DB_ERR_BUSY = -17, + DML_DB_ERR_LMDB = -18, + DML_DB_ERR_INTERNAL = -19, + DML_DB_ERR_ENV_REOPENED = -20 +}; + +// Open a lmdb database (and optionally create if flag is set and does not exist) +int dml_db_open( + const char *path, + const char *const *namespaces, + size_t namespace_count, + const int create_if_missing, + size_t map_size, + DmlDbHandle **out_handle +); +int dml_db_close(DmlDbHandle **p_handle); + +int dml_db_mapsize(DmlDbHandle **p_handle, size_t *out_mapsize); +int dml_db_resize(DmlDbHandle **p_handle, size_t mapsize); + +int dml_db_txn_begin(DmlDbHandle **p_handle, const int readonly, DmlDbTxn **out_txn); +int dml_db_txn_fin(DmlDbHandle **p_handle, DmlDbTxn *txn, const int commit); + +int dml_db_put( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + const DmlValue *value, + int no_overwrite, + int raw, + DmlValue **out_ref +); +int dml_db_get( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + int raw, + DmlValue **out_value +); +int dml_db_del( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const 
char *ns, + size_t ns_len, + const char *key, + size_t key_len +); +int dml_db_exists( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + int *out_exists +); + +int dml_db_iter_keys( + struct DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + const char *start_token, + DmlObjCollection *out_page +); +void dml_db_free_obj_collection(DmlObjCollection *page); + +int dml_db_list_orphans( + struct DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *const *start_refs, + size_t start_refs_count, + DmlValue **out_refs +); + +#endif diff --git a/c/include/dml_hash.h b/c/include/dml_hash.h new file mode 100644 index 0000000..93828bb --- /dev/null +++ b/c/include/dml_hash.h @@ -0,0 +1,16 @@ +#ifndef DAGGERML_DML_HASH_H +#define DAGGERML_DML_HASH_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int dml_hash_sha256_hex(const void *data, size_t len, char out[65]); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/c/include/dml_msgpack.h b/c/include/dml_msgpack.h new file mode 100644 index 0000000..fea7301 --- /dev/null +++ b/c/include/dml_msgpack.h @@ -0,0 +1,35 @@ +#ifndef DAGGERML_DML_MSGPACK_H +#define DAGGERML_DML_MSGPACK_H + +#include + +#include "dml_value.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + void *data; + size_t size; +} DmlMsgpackBuffer; + +enum { + DML_MSGPACK_OK = 0, + DML_MSGPACK_ERR_INVALID = -1, + DML_MSGPACK_ERR_NOMEM = -2 +}; + +enum { + DML_MSGPACK_EXT_REF = 1 +}; + +int dml_msgpack_pack(const DmlValue *value, DmlMsgpackBuffer *out_buffer); +int dml_msgpack_unpack(const char *data, size_t size, DmlValue **out_value); +void dml_msgpack_free_buffer(void *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/c/include/dml_value.h b/c/include/dml_value.h new file mode 100644 index 0000000..0c84676 --- /dev/null +++ b/c/include/dml_value.h @@ -0,0 +1,83 @@ +#ifndef DAGGERML_DML_VALUE_H +#define DAGGERML_DML_VALUE_H + +#include + 
// One key/value pair of a DML_VALUE_MAP.
typedef struct {
    char *key;       // key bytes; length is carried separately in key_len
    size_t key_len;  // number of bytes in `key`
    DmlValue *value; // value stored under `key`
} DmlMapEntry;

// Tagged union holding one dynamically typed value.
// `type` selects which member of `as` is meaningful.
struct DmlValue {
    DmlValueType type;
    union {
        int boolean;       // DML_VALUE_BOOL
        long long integer; // DML_VALUE_INT
        double floating;   // DML_VALUE_FLOAT
        // DML_VALUE_STR: `size` bytes at `data`
        struct {
            char *data;
            size_t size;
        } str;
        // DML_VALUE_LIST: `count` element pointers
        struct {
            DmlValue **items;
            size_t count;
        } list;
        // DML_VALUE_MAP: `count` entries
        struct {
            DmlMapEntry *entries;
            size_t count;
        } map;
        // DML_VALUE_REF: `size` bytes of reference text at `data`
        // (the database layer builds these as "<namespace>:<key>")
        struct {
            char *data;
            size_t size;
        } ref;
    } as;
};
+ + +struct DmlDbHandle { + MDB_env *env; + char **namespaces; + char *path; + size_t namespace_count; + pid_t owner_pid; +}; + +struct DmlDbTxn { + MDB_txn *txn; + pthread_t owner_thread; + bool readonly; +}; + +typedef struct { + char *key; + size_t key_len; + void *value; + size_t value_len; +} DmlDumpEntry; + +typedef struct { + DmlDumpEntry *entries; + size_t count; + size_t capacity; +} DmlDumpList; + +static int dml_map_lmdb_rc(int rc) { + if (rc == MDB_MAP_FULL) { + return DML_DB_ERR_MAP_FULL; + } + if (rc == ENOMEM) { + return DML_DB_ERR_NOMEM; + } + if (rc == ENOENT || rc == ENOTDIR || rc == EACCES) { + return DML_DB_ERR_PATH_INVALID; + } + if (rc == MDB_KEYEXIST) { + return DML_DB_ERR_KEY_EXISTS; + } + if (rc == EBUSY || rc == EAGAIN) { + return DML_DB_ERR_BUSY; + } + return DML_DB_ERR_LMDB; +} + +static void dml_dump_list_free(DmlDumpList *list) { + if (list == NULL) return; + for (size_t i = 0; i < list->count; i++) { + free(list->entries[i].key); + if (list->entries[i].value != NULL) { + dml_value_free((DmlValue *)list->entries[i].value); + } + } + free(list->entries); + list->entries = NULL; + list->count = 0; + list->capacity = 0; +} + +static int dml_dump_list_find(const DmlDumpList *list, const char *key, size_t key_len) { + for (size_t i = 0; i < list->count; i++) { + if (list->entries[i].key_len == key_len && + memcmp(list->entries[i].key, key, key_len) == 0) { + return (int)i; + } + } + return -1; +} + +static int dml_dump_list_add(DmlDumpList *list, const char *key, size_t key_len, DmlValue *value) { + if (list->count == list->capacity) { + size_t next = list->capacity == 0 ? 
8 : list->capacity * 2; + DmlDumpEntry *next_entries = (DmlDumpEntry *)realloc(list->entries, next * sizeof(*next_entries)); + if (next_entries == NULL) { + return DML_DB_ERR_NOMEM; + } + list->entries = next_entries; + list->capacity = next; + } + char *key_copy = (char *)malloc(key_len); + if (key_copy == NULL) { + return DML_DB_ERR_NOMEM; + } + memcpy(key_copy, key, key_len); + list->entries[list->count].key = key_copy; + list->entries[list->count].key_len = key_len; + list->entries[list->count].value = value; + list->entries[list->count].value_len = 0; + list->count += 1; + return 0; +} + +static int dml_dump_visit_value( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + DmlDumpList *list, + const DmlValue *value +); + +static int dml_dump_add_ref( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + DmlDumpList *list, + const char *key, + size_t key_len +) { + const char *ns = NULL; + const char *ident = NULL; + size_t ns_len = 0; + size_t id_len = 0; + DmlValue *value = NULL; + int rc; + + if (dml_dump_list_find(list, key, key_len) >= 0) { + return 0; + } + if (dml_ref_split(key, key_len, &ns, &ns_len, &ident, &id_len) != 0) { + return DML_DB_ERR_REF_INVALID; + } + rc = dml_db_get(p_handle, txn, ns, ns_len, ident, id_len, 0, &value); + if (rc != 0) { + return rc; + } + rc = dml_dump_list_add(list, key, key_len, value); + if (rc != 0) { + dml_value_free(value); + return rc; + } + rc = dml_dump_visit_value(p_handle, txn, list, value); + if (rc != 0) { + return rc; + } + return 0; +} + +static int dml_dump_visit_value( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + DmlDumpList *list, + const DmlValue *value +) { + if (value == NULL) return 0; + switch (value->type) { + case DML_VALUE_REF: + return dml_dump_add_ref(p_handle, txn, list, value->as.ref.data, value->as.ref.size); + case DML_VALUE_LIST: + for (size_t i = 0; i < value->as.list.count; i++) { + int rc = dml_dump_visit_value(p_handle, txn, list, value->as.list.items[i]); + if (rc != 0) return rc; + } + return 0; + case 
DML_VALUE_MAP: + for (size_t i = 0; i < value->as.map.count; i++) { + int rc = dml_dump_visit_value(p_handle, txn, list, value->as.map.entries[i].value); + if (rc != 0) return rc; + } + return 0; + default: + return 0; + } +} + +static int dml_db_reopen_handle(struct DmlDbHandle **p_handle) { + if (p_handle == NULL || *p_handle == NULL) return DML_DB_ERR_HANDLE_INVALID; + struct DmlDbHandle *handle = *p_handle; + // Copy args needed for reopen + char *path_copy = strdup(handle->path); + if (!path_copy) return DML_DB_ERR_NOMEM; + size_t ns_count = handle->namespace_count; + char **ns_copy = NULL; + if (ns_count > 0) { + ns_copy = (char **)calloc(ns_count, sizeof(char *)); + if (!ns_copy) { + free(path_copy); + return DML_DB_ERR_NOMEM; + } + for (size_t i = 0; i < ns_count; i++) { + ns_copy[i] = strdup(handle->namespaces[i]); + if (!ns_copy[i]) { + for (size_t j = 0; j < i; j++) free(ns_copy[j]); + free(ns_copy); + free(path_copy); + return DML_DB_ERR_NOMEM; + } + } + } + // Close old handle FIRST to avoid "two handles in one process" issue + dml_db_close(p_handle); + struct DmlDbHandle *new_handle = NULL; + int rc = dml_db_open( + path_copy, + (const char *const *)ns_copy, + ns_count, + 0, // create_if_missing: assumed irrelevant for reopen of existing + 0, // map_size: 0 means keep existing + &new_handle + ); + // Cleanup copies + free(path_copy); + if (ns_copy) { + for (size_t i = 0; i < ns_count; i++) free(ns_copy[i]); + free(ns_copy); + } + if (rc != 0) { return rc; } + *p_handle = new_handle; + return 0; +} + +static int dml_db_validate(struct DmlDbHandle **p_handle, const int reopen) { + if (p_handle == NULL || *p_handle == NULL) return DML_DB_ERR_HANDLE_INVALID; + MDB_stat st; + int rc; + int env_was_reopened = 0; + struct DmlDbHandle *handle = *p_handle; + if (handle->owner_pid != getpid()) { + if (reopen) { + int rc = dml_db_reopen_handle(p_handle); + if (rc != 0) return rc; + handle = *p_handle; + env_was_reopened = 1; + } else { + return 
DML_DB_ERR_HANDLE_FORKED; + } + } + if (handle->env == NULL) return DML_DB_ERR_HANDLE_CLOSED; + if (handle->namespace_count == 0) return DML_DB_ERR_HANDLE_INVALID; + rc = mdb_env_stat((*p_handle)->env, &st); + if (rc != MDB_SUCCESS) { + // If environment is invalid (e.g., another handle to same DB was closed), + // reopen if requested + if (rc == EINVAL && reopen) { + int rc = dml_db_reopen_handle(p_handle); + if (rc != 0) return rc; + handle = *p_handle; + env_was_reopened = 1; + // Verify the new handle works + rc = mdb_env_stat(handle->env, &st); + if (rc != MDB_SUCCESS) return dml_map_lmdb_rc(rc); + } else { + return dml_map_lmdb_rc(rc); + } + } + // If we reopened the environment, all existing transactions are now invalid + // Signal to caller that they need to retry their transaction + if (env_was_reopened) { + return DML_DB_ERR_ENV_REOPENED; + } + return 0; +} + +static int dml_db_validate_txn(struct DmlDbHandle **p_handle, struct DmlDbTxn *txn, const int reopen) { + int rc; + // If txn is NULL, allow reopening. If txn exists, don't reopen (would invalidate txn) + rc = dml_db_validate(p_handle, txn == NULL ? 
reopen : 0); + if (rc != 0) { + if (rc == DML_DB_ERR_HANDLE_FORKED) { + return DML_DB_ERR_TXN_FORKED; + } + // ENV_REOPENED should not occur when txn != NULL (since we pass reopen=0) + // but if it does, it means the handle was reopened elsewhere - propagate it + return rc; + } + if (txn == NULL || txn->txn == NULL) { + return DML_DB_ERR_TXN_INVALID; + } + if (pthread_self() != txn->owner_thread) { + return DML_DB_ERR_TXN_FORKED; + } + return 0; +} + +// dml_db_open opens a lmdb database (and optionally create if flag is set and does not exist) +// we own the handle +int dml_db_open( + const char *path, + const char *const *namespaces, + size_t namespace_count, + const int create_if_missing, + size_t map_size, + DmlDbHandle **out_handle +) { + DmlDbHandle *handle = NULL; + int rc; + if (path == NULL || out_handle == NULL) { + return DML_DB_ERR_INPUT_INVALID; + } + if (namespace_count == 0 || namespaces == NULL) { + return DML_DB_ERR_INPUT_INVALID; + } + handle = (DmlDbHandle *)calloc(1, sizeof(DmlDbHandle)); + if (handle == NULL) { + return DML_DB_ERR_NOMEM; + } + rc = mdb_env_create(&handle->env); + if (rc != MDB_SUCCESS) { + dml_db_close(&handle); + return dml_map_lmdb_rc(rc); + } + rc = mdb_env_set_maxdbs(handle->env, (unsigned int)namespace_count); + if (rc != MDB_SUCCESS) { + dml_db_close(&handle); + return dml_map_lmdb_rc(rc); + } + if (map_size > 0) { + rc = mdb_env_set_mapsize(handle->env, map_size); + if (rc != MDB_SUCCESS) { + dml_db_close(&handle); + return dml_map_lmdb_rc(rc); + } + } + rc = mdb_env_open(handle->env, path, create_if_missing ? 
MDB_CREATE : 0, 0664); + if (rc != MDB_SUCCESS) { + dml_db_close(&handle); + return dml_map_lmdb_rc(rc); + } + handle->owner_pid = getpid(); + // Copy namespaces + if (namespaces != NULL && namespace_count > 0) { + size_t i; + handle->namespaces = (char **)calloc(namespace_count, sizeof(char *)); + if (handle->namespaces == NULL) { + dml_db_close(&handle); + return DML_DB_ERR_NOMEM; + } + handle->namespace_count = namespace_count; + for (i = 0; i < namespace_count; i++) { + if (namespaces[i] == NULL) { + dml_db_close(&handle); + return DML_DB_ERR_INPUT_INVALID; + } + size_t len = strlen(namespaces[i]); + handle->namespaces[i] = (char *)malloc(len + 1); + if (handle->namespaces[i] == NULL) { + dml_db_close(&handle); + return DML_DB_ERR_NOMEM; + } + memcpy(handle->namespaces[i], namespaces[i], len); + handle->namespaces[i][len] = '\0'; + } + } + handle->path = strdup(path); + *out_handle = handle; + return 0; +} + +int dml_db_close(struct DmlDbHandle **p_handle) { + if (p_handle == NULL || *p_handle == NULL) return 0; + struct DmlDbHandle *handle = *p_handle; + *p_handle = NULL; + if (handle->namespaces != NULL) { + for (size_t i = 0; i < handle->namespace_count; i++) { + free(handle->namespaces[i]); // ok if NULL + } + free(handle->namespaces); + handle->namespaces = NULL; + } + if (handle->path != NULL) { + free(handle->path); + handle->path = NULL; + } + if (handle->env != NULL) mdb_env_close(handle->env); + free(handle); + return 0; +} + +int dml_db_mapsize(struct DmlDbHandle **p_handle, size_t *out_mapsize) { + MDB_envinfo info; + int rc; + if (out_mapsize == NULL) { + return DML_DB_ERR_INPUT_INVALID; + } + *out_mapsize = 0; + rc = dml_db_validate(p_handle, 1); + if (rc != 0) { + return rc; + } + memset(&info, 0, sizeof(info)); + struct DmlDbHandle *handle = *p_handle; + rc = mdb_env_info(handle->env, &info); + if (rc != MDB_SUCCESS) { + return dml_map_lmdb_rc(rc); + } + *out_mapsize = info.me_mapsize; + return 0; +} +int dml_db_resize(struct DmlDbHandle 
**p_handle, size_t mapsize) { + int rc; + rc = dml_db_validate(p_handle, 1); + if (rc != 0) { + return rc; + } + rc = mdb_env_set_mapsize((*p_handle)->env, mapsize); + if (rc != MDB_SUCCESS) { + return dml_map_lmdb_rc(rc); + } + return 0; +} + +// transactions +int dml_db_txn_begin(DmlDbHandle **p_handle, const int readonly, DmlDbTxn **out_txn) { + MDB_txn *txn = NULL; + DmlDbTxn *wrapper = NULL; + int rc; + rc = dml_db_validate(p_handle, 1); + // ENV_REOPENED means the env was repaired and is ready for new transactions + if (rc != 0 && rc != DML_DB_ERR_ENV_REOPENED) { + return rc; + } + rc = mdb_txn_begin((*p_handle)->env, NULL, readonly ? MDB_RDONLY : 0, &txn); + // If we still get EINVAL after validate, try reopening one more time + if (rc == EINVAL) { + int rc2 = dml_db_reopen_handle(p_handle); + if (rc2 != 0) { + return dml_map_lmdb_rc(rc); + } + // Retry with the new handle + rc = mdb_txn_begin((*p_handle)->env, NULL, readonly ? MDB_RDONLY : 0, &txn); + } + if (rc != MDB_SUCCESS) { + return dml_map_lmdb_rc(rc); + } + wrapper = (DmlDbTxn *)calloc(1, sizeof(*wrapper)); + if (wrapper == NULL) { + mdb_txn_abort(txn); + return DML_DB_ERR_NOMEM; + } + wrapper->txn = txn; + wrapper->owner_thread = pthread_self(); + wrapper->readonly = readonly ? true : false; + *out_txn = wrapper; + return 0; +} +int dml_db_txn_fin(DmlDbHandle **p_handle, DmlDbTxn *txn, const int commit) { + int rc = 0; + rc = dml_db_validate(p_handle, 0); + if (rc != 0) { + if (rc == DML_DB_ERR_HANDLE_FORKED) { + return DML_DB_ERR_TXN_FORKED; + } + return rc; + } + if (txn == NULL) { + return rc; + } + + // Prevent double-free by checking if txn is already freed + if (txn->txn != NULL) { + if (txn->readonly) { + mdb_txn_abort(txn->txn); + rc = 0; + } else if (commit) { + rc = mdb_txn_commit(txn->txn); + if (rc != MDB_SUCCESS) { + // LMDB aborts and frees the txn on commit failure. 
+ rc = dml_map_lmdb_rc(rc); + } else { + rc = 0; + } + } else { + mdb_txn_abort(txn->txn); + } + // Mark as freed to prevent double-free + txn->txn = NULL; + } + txn->owner_thread = 0; + // Only free if not already freed + free(txn); + // Set txn to NULL to avoid dangling pointer + txn = NULL; + return rc; +} + +// io +int dml_ns_dbi_lookup( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + MDB_dbi *out_dbi +) { + // look for namespace in handle and if none found, create it + size_t i; + int rc; + unsigned int flags = 0; + rc = dml_db_validate_txn(p_handle, txn, 1); + if (rc != 0) return rc; + if (!txn->readonly) flags |= MDB_CREATE; + if (ns == NULL || ns_len == 0 || out_dbi == NULL) { + return DML_DB_ERR_INPUT_INVALID; + } + DmlDbHandle *handle = *p_handle; + for (i = 0; i < handle->namespace_count; i++) { + if (strlen(handle->namespaces[i]) == ns_len && + memcmp(handle->namespaces[i], ns, ns_len) == 0) { + // found + rc = mdb_dbi_open(txn->txn, handle->namespaces[i], flags, out_dbi); + if (rc == MDB_NOTFOUND) { + return DML_DB_ERR_NOT_FOUND; + } + if (rc != MDB_SUCCESS) { + return dml_map_lmdb_rc(rc); + } + return 0; + } + } + return DML_DB_ERR_NAMESPACE_INVALID; +} +int dml_db_put( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + const DmlValue *value, + int no_overwrite, + int raw, + DmlValue **out_ref +) { + int rc = 0; + MDB_dbi dbi; + DmlDbTxn *local_txn = txn; + DmlMsgpackBuffer buffer = {0}; + + char *owned_key = NULL; // <— if we compute it, we own it + size_t owned_key_len = 0; + MDB_val db_value = {0}; + + if (ns == NULL || ns_len == 0) return DML_DB_ERR_INPUT_INVALID; + rc = dml_db_validate_txn(p_handle, txn, 0); + if (rc != 0) return rc; + if (local_txn->readonly) { rc = DML_DB_ERR_TXN_READONLY; goto cleanup; } + if (raw) { + // Raw mode: value should be a string DmlValue containing raw bytes + if (value->type != DML_VALUE_STR) { + rc = 
DML_DB_ERR_INPUT_INVALID; + goto cleanup; + } + // Store raw bytes directly - no msgpack packing + db_value.mv_data = value->as.str.data; + db_value.mv_size = value->as.str.size; + } else { + rc = dml_msgpack_pack(value, &buffer); + if (rc != 0) { rc = DML_DB_ERR_MSGPACK; goto cleanup; } + db_value.mv_data = buffer.data; + db_value.mv_size = buffer.size; + } + if (key == NULL || key_len == 0) { + char hex[65]; + if (raw) { + // For raw mode, hash the raw data directly + if (dml_hash_sha256_hex(db_value.mv_data, db_value.mv_size, hex) != 0) { + rc = DML_DB_ERR_INTERNAL; + goto cleanup; + } + } else { + // For normal mode, hash the buffer data + if (dml_hash_sha256_hex(buffer.data, buffer.size, hex) != 0) { + rc = DML_DB_ERR_INTERNAL; + goto cleanup; + } + } + owned_key_len = strlen(hex); + owned_key = (char *)malloc(owned_key_len); + if (!owned_key) { rc = DML_DB_ERR_NOMEM; goto cleanup; } + memcpy(owned_key, hex, owned_key_len); + key = owned_key; + key_len = owned_key_len; + } + rc = dml_ns_dbi_lookup(p_handle, local_txn, ns, ns_len, &dbi); + if (rc != 0) { goto cleanup; } + { + MDB_val db_key = { .mv_size = key_len, .mv_data = (void *)key }; + unsigned int flags = no_overwrite ? 
MDB_NOOVERWRITE : 0; + + rc = mdb_put(local_txn->txn, dbi, &db_key, &db_value, flags); + if (rc == MDB_KEYEXIST && no_overwrite) { + rc = 0; + } + if (rc != MDB_SUCCESS) { + rc = dml_map_lmdb_rc(rc); + goto cleanup; + } + } + if (out_ref != NULL) { + size_t ref_len = ns_len + 1 + key_len; + char *ref_data = (char *)malloc(ref_len); + if (!ref_data) { rc = DML_DB_ERR_NOMEM; goto cleanup; } + + memcpy(ref_data, ns, ns_len); + ref_data[ns_len] = ':'; + memcpy(ref_data + ns_len + 1, key, key_len); + + DmlValue *ref_value = dml_value_new_ref(ref_data, ref_len); + free(ref_data); + + if (!ref_value) { rc = DML_DB_ERR_NOMEM; goto cleanup; } + *out_ref = ref_value; + } +cleanup: + if (buffer.data) dml_msgpack_free_buffer(buffer.data); + free(owned_key); + return rc; +} +int dml_db_get( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + int raw, + DmlValue **out_value +) { + int rc = 0; + MDB_dbi dbi; + MDB_val db_key; + MDB_val db_value; + DmlDbTxn *local_txn = txn; + + if (key == NULL || key_len == 0) return DML_DB_ERR_INPUT_INVALID; + rc = dml_db_validate_txn(p_handle, txn, 0); + if (rc != 0) return rc; + // lookup namespace + rc = dml_ns_dbi_lookup(p_handle, local_txn, ns, ns_len, &dbi); + if (rc != 0) return rc; + // get value + db_key.mv_size = key_len; + db_key.mv_data = (void *)key; + rc = mdb_get(local_txn->txn, dbi, &db_key, &db_value); + if (rc == MDB_NOTFOUND) { + return DML_DB_ERR_NOT_FOUND; + } + if (rc != MDB_SUCCESS) { + return dml_map_lmdb_rc(rc); + } + if (raw) { + // Return raw bytes as a string DmlValue + *out_value = dml_value_new_str(db_value.mv_data, db_value.mv_size); + if (*out_value == NULL) { + return DML_DB_ERR_NOMEM; + } + } else { + rc = dml_msgpack_unpack(db_value.mv_data, db_value.mv_size, out_value); + if (rc != 0) { + return DML_DB_ERR_MSGPACK; + } + } + return 0; +} + +int dml_db_del( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char 
*key, + size_t key_len +) { + int rc = 0; + MDB_dbi dbi; + MDB_val db_key; + DmlDbTxn *local_txn = txn; + + if (key == NULL || key_len == 0) return DML_DB_ERR_INPUT_INVALID; + rc = dml_db_validate_txn(p_handle, txn, 0); + if (rc != 0) return rc; + if (local_txn->readonly) return DML_DB_ERR_TXN_READONLY; + + rc = dml_ns_dbi_lookup(p_handle, local_txn, ns, ns_len, &dbi); + if (rc != 0) return rc; + + db_key.mv_size = key_len; + db_key.mv_data = (void *)key; + rc = mdb_del(local_txn->txn, dbi, &db_key, NULL); + if (rc == MDB_NOTFOUND) return DML_DB_ERR_NOT_FOUND; + if (rc != MDB_SUCCESS) return dml_map_lmdb_rc(rc); + return 0; +} + +int dml_db_exists( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + int *out_exists +) { + int rc = 0; + MDB_dbi dbi; + MDB_val db_key; + MDB_val db_value; + DmlDbTxn *local_txn = txn; + + if (out_exists == NULL) return DML_DB_ERR_INPUT_INVALID; + *out_exists = 0; + if (key == NULL || key_len == 0) return DML_DB_ERR_INPUT_INVALID; + rc = dml_db_validate_txn(p_handle, txn, 0); + if (rc != 0) return rc; + rc = dml_ns_dbi_lookup(p_handle, local_txn, ns, ns_len, &dbi); + if (rc != 0) { + if (rc == DML_DB_ERR_NOT_FOUND) { + *out_exists = 0; + rc = 0; + } + return rc; + } + db_key.mv_size = key_len; + db_key.mv_data = (void *)key; + rc = mdb_get(local_txn->txn, dbi, &db_key, &db_value); + if (rc == MDB_NOTFOUND) { + *out_exists = 0; + return 0; + } + if (rc != MDB_SUCCESS) { + return dml_map_lmdb_rc(rc); + } + *out_exists = 1; + return 0; +} + +int dml_db_iter_keys( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + const char *start_token, + DmlObjCollection *out_page +) { + int rc = 0; + MDB_dbi dbi; + MDB_cursor *cursor = NULL; + MDB_val db_key; + MDB_val db_value; + DmlDbTxn *local_txn = txn; + size_t count = 0; + size_t keys_len = 0; + size_t keys_cap = 0; + char *keys = NULL; + size_t *key_lens = NULL; + DmlValue **values = NULL; + char *next_token = NULL; + + 
if (out_page == NULL) { + return DML_DB_ERR_INPUT_INVALID; + } + out_page->keys = NULL; + out_page->values = NULL; + out_page->count = 0; + out_page->next_token = NULL; + + if (ns == NULL || ns[0] == '\0') return DML_DB_ERR_INPUT_INVALID; + rc = dml_db_validate_txn(p_handle, txn, 0); + if (rc != 0) return rc; + rc = dml_ns_dbi_lookup(p_handle, local_txn, ns, strlen(ns), &dbi); + if (rc != 0) { + goto cleanup; + } + rc = mdb_cursor_open(local_txn->txn, dbi, &cursor); + if (rc != MDB_SUCCESS) { + rc = dml_map_lmdb_rc(rc); + goto cleanup; + } + if (start_token != NULL && start_token[0] != '\0') { + db_key.mv_data = (void *)start_token; + db_key.mv_size = strlen(start_token); + rc = mdb_cursor_get(cursor, &db_key, &db_value, MDB_SET_RANGE); + } else { + rc = mdb_cursor_get(cursor, &db_key, &db_value, MDB_FIRST); + } + if (rc != MDB_SUCCESS) { + rc = (rc == MDB_NOTFOUND) ? 0 : dml_map_lmdb_rc(rc); + goto cleanup; + } + values = (DmlValue **)calloc(DML_DB_ITER_LIMIT, sizeof(*values)); + key_lens = (size_t *)calloc(DML_DB_ITER_LIMIT, sizeof(*key_lens)); + if (values == NULL || key_lens == NULL) { + rc = DML_DB_ERR_NOMEM; + goto cleanup; + } + while (rc == MDB_SUCCESS && count < DML_DB_ITER_LIMIT) { + size_t key_len = db_key.mv_size; + size_t needed = keys_len + key_len + 1; + DmlValue *value = NULL; + + if (needed > keys_cap) { + size_t next_cap = keys_cap == 0 ? 
128 : keys_cap * 2; + if (next_cap < needed) next_cap = needed; + char *next_keys = (char *)realloc(keys, next_cap); + if (next_keys == NULL) { + rc = DML_DB_ERR_NOMEM; + goto cleanup; + } + keys = next_keys; + keys_cap = next_cap; + } + if (dml_msgpack_unpack(db_value.mv_data, db_value.mv_size, &value) != 0 || value == NULL) { + rc = DML_DB_ERR_MSGPACK; + goto cleanup; + } + values[count] = value; + key_lens[count] = key_len; + memcpy(keys + keys_len, db_key.mv_data, key_len); + keys_len += key_len; + keys[keys_len] = '\0'; + keys_len += 1; + count += 1; + if (count >= DML_DB_ITER_LIMIT) { + rc = mdb_cursor_get(cursor, &db_key, &db_value, MDB_NEXT); + if (rc == MDB_SUCCESS) { + size_t token_len = db_key.mv_size; + next_token = (char *)malloc(token_len + 1); + if (next_token == NULL) { + rc = DML_DB_ERR_NOMEM; + goto cleanup; + } + memcpy(next_token, db_key.mv_data, token_len); + next_token[token_len] = '\0'; + } else if (rc == MDB_NOTFOUND) { + rc = 0; + } else { + rc = dml_map_lmdb_rc(rc); + } + break; + } + rc = mdb_cursor_get(cursor, &db_key, &db_value, MDB_NEXT); + if (rc == MDB_NOTFOUND) { + rc = 0; + break; + } + if (rc != MDB_SUCCESS) { + rc = dml_map_lmdb_rc(rc); + break; + } + } + if (rc != 0) { + goto cleanup; + } + out_page->keys = keys; + out_page->key_lens = key_lens; + out_page->values = values; + out_page->count = count; + out_page->next_token = next_token; + keys = NULL; + key_lens = NULL; + values = NULL; + next_token = NULL; + +cleanup: + if (cursor != NULL) mdb_cursor_close(cursor); + if (values != NULL) { + for (size_t i = 0; i < count; i++) { + if (values[i] != NULL) { + dml_value_free(values[i]); + } + } + free(values); + } + free(keys); + free(key_lens); + free(next_token); + return rc; +} + +int dml_db_list_orphans( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *const *start_refs, + size_t start_refs_count, + DmlValue **out_refs +) { + int rc = 0; + DmlDbTxn *local_txn = txn; + DmlDumpList reachable = {0}; + DmlDumpList orphans = 
{0}; + MDB_dbi dbi; + MDB_cursor *cursor = NULL; + MDB_val db_key; + MDB_val db_value; + + if (out_refs == NULL) { + return DML_DB_ERR_INPUT_INVALID; + } + *out_refs = NULL; + if (start_refs_count > 0 && start_refs == NULL) { + return DML_DB_ERR_INPUT_INVALID; + } + rc = dml_db_validate_txn(p_handle, txn, 0); + if (rc != 0) return rc; + + for (size_t i = 0; i < start_refs_count; i++) { + if (start_refs[i] == NULL) { + rc = DML_DB_ERR_INPUT_INVALID; + goto cleanup; + } + size_t ref_len = strlen(start_refs[i]); + rc = dml_dump_add_ref(p_handle, local_txn, &reachable, start_refs[i], ref_len); + if (rc != 0) { + goto cleanup; + } + } + + DmlDbHandle *handle = *p_handle; + for (size_t i = 0; i < handle->namespace_count; i++) { + const char *ns = handle->namespaces[i]; + size_t ns_len = strlen(ns); + + rc = dml_ns_dbi_lookup(p_handle, local_txn, ns, ns_len, &dbi); + if (rc == DML_DB_ERR_NOT_FOUND) { + rc = 0; + continue; + } + if (rc != 0) { + goto cleanup; + } + rc = mdb_cursor_open(local_txn->txn, dbi, &cursor); + if (rc != MDB_SUCCESS) { + rc = dml_map_lmdb_rc(rc); + goto cleanup; + } + rc = mdb_cursor_get(cursor, &db_key, &db_value, MDB_FIRST); + if (rc == MDB_NOTFOUND) { + mdb_cursor_close(cursor); + cursor = NULL; + rc = 0; + continue; + } + if (rc != MDB_SUCCESS) { + rc = dml_map_lmdb_rc(rc); + goto cleanup; + } + while (rc == MDB_SUCCESS) { + size_t ref_len = ns_len + 1 + db_key.mv_size; + char *ref_data = (char *)malloc(ref_len); + if (ref_data == NULL) { + rc = DML_DB_ERR_NOMEM; + goto cleanup; + } + memcpy(ref_data, ns, ns_len); + ref_data[ns_len] = ':'; + memcpy(ref_data + ns_len + 1, db_key.mv_data, db_key.mv_size); + if (dml_dump_list_find(&reachable, ref_data, ref_len) < 0) { + rc = dml_dump_list_add(&orphans, ref_data, ref_len, NULL); + free(ref_data); + if (rc != 0) goto cleanup; + } else { + free(ref_data); + } + rc = mdb_cursor_get(cursor, &db_key, &db_value, MDB_NEXT); + } + if (rc == MDB_NOTFOUND) { + rc = 0; + } else if (rc != 0) { + rc = 
dml_map_lmdb_rc(rc); + } + mdb_cursor_close(cursor); + cursor = NULL; + if (rc != 0) goto cleanup; + } + + DmlValue *result = dml_value_new_list(orphans.count); + if (result == NULL) { + rc = DML_DB_ERR_NOMEM; + goto cleanup; + } + for (size_t i = 0; i < orphans.count; i++) { + DmlDumpEntry *entry = &orphans.entries[i]; + DmlValue *ref_val = dml_value_new_ref(entry->key, entry->key_len); + if (ref_val == NULL || dml_value_list_set(result, i, ref_val) != 0) { + dml_value_free(ref_val); + dml_value_free(result); + rc = DML_DB_ERR_NOMEM; + goto cleanup; + } + } + *out_refs = result; + result = NULL; + +cleanup: + if (cursor != NULL) mdb_cursor_close(cursor); + dml_dump_list_free(&reachable); + dml_dump_list_free(&orphans); + return rc; +} + +void dml_db_free_obj_collection(DmlObjCollection *page) { + if (page == NULL) return; + if (page->values != NULL) { + for (size_t i = 0; i < page->count; i++) { + if (page->values[i] != NULL) { + dml_value_free(page->values[i]); + } + } + free(page->values); + page->values = NULL; + } + free(page->keys); + page->keys = NULL; + free(page->key_lens); + page->key_lens = NULL; + free(page->next_token); + page->next_token = NULL; + page->count = 0; +} diff --git a/c/src/dml_hash.c b/c/src/dml_hash.c new file mode 100644 index 0000000..150905c --- /dev/null +++ b/c/src/dml_hash.c @@ -0,0 +1,28 @@ +#include +#include +#include + +#include "../third_party/sha256/sha256.h" +#include "../include/dml_hash.h" + +int +dml_hash_sha256_hex(const void *data, size_t len, char out[65]) +{ + SHA256_CTX ctx; + uint8_t hash[SHA256_BLOCK_SIZE]; + size_t i; + + if (out == NULL) { + return -1; + } + + sha256_init(&ctx); + sha256_update(&ctx, (const uint8_t *)data, len); + sha256_final(&ctx, hash); + + for (i = 0; i < SHA256_BLOCK_SIZE; i++) { + snprintf(&out[i * 2], 3, "%02x", hash[i]); + } + out[64] = '\0'; + return 0; +} diff --git a/c/src/dml_msgpack.c b/c/src/dml_msgpack.c new file mode 100644 index 0000000..098a5d3 --- /dev/null +++ 
b/c/src/dml_msgpack.c @@ -0,0 +1,253 @@ +#include +#include + +#include "../third_party/msgpack/include/msgpack.h" +#include "../include/dml_msgpack.h" + +static int +dml_msgpack_entry_compare(const void *left, const void *right) +{ + const DmlMapEntry *a = *(const DmlMapEntry * const *)left; + const DmlMapEntry *b = *(const DmlMapEntry * const *)right; + size_t min_len = a->key_len < b->key_len ? a->key_len : b->key_len; + int cmp = 0; + + if (min_len > 0) { + cmp = memcmp(a->key, b->key, min_len); + } + if (cmp != 0) { + return cmp; + } + if (a->key_len < b->key_len) { + return -1; + } + if (a->key_len > b->key_len) { + return 1; + } + return 0; +} + +static int +dml_msgpack_pack_value(msgpack_packer *packer, const DmlValue *value) +{ + size_t i; + + if (value == NULL) { + return DML_MSGPACK_ERR_INVALID; + } + + switch (value->type) { + case DML_VALUE_NULL: + return msgpack_pack_nil(packer); + case DML_VALUE_BOOL: + if (value->as.boolean) { + return msgpack_pack_true(packer); + } + return msgpack_pack_false(packer); + case DML_VALUE_INT: + return msgpack_pack_long_long(packer, value->as.integer); + case DML_VALUE_FLOAT: + return msgpack_pack_double(packer, value->as.floating); + case DML_VALUE_STR: + if (msgpack_pack_str(packer, value->as.str.size) != 0) { + return DML_MSGPACK_ERR_INVALID; + } + return msgpack_pack_str_body(packer, value->as.str.data, value->as.str.size); + case DML_VALUE_LIST: + if (msgpack_pack_array(packer, value->as.list.count) != 0) { + return DML_MSGPACK_ERR_INVALID; + } + for (i = 0; i < value->as.list.count; i++) { + if (dml_msgpack_pack_value(packer, value->as.list.items[i]) != 0) { + return DML_MSGPACK_ERR_INVALID; + } + } + return 0; + case DML_VALUE_MAP: + if (msgpack_pack_map(packer, value->as.map.count) != 0) { + return DML_MSGPACK_ERR_INVALID; + } + if (value->as.map.count > 1) { + DmlMapEntry **sorted = (DmlMapEntry **)calloc(value->as.map.count, sizeof(DmlMapEntry *)); + if (sorted == NULL) { + return DML_MSGPACK_ERR_NOMEM; + } + 
for (i = 0; i < value->as.map.count; i++) { + sorted[i] = &value->as.map.entries[i]; + } + qsort(sorted, value->as.map.count, sizeof(DmlMapEntry *), dml_msgpack_entry_compare); + for (i = 0; i < value->as.map.count; i++) { + DmlMapEntry *entry = sorted[i]; + if (msgpack_pack_str(packer, entry->key_len) != 0) { + free(sorted); + return DML_MSGPACK_ERR_INVALID; + } + if (msgpack_pack_str_body(packer, entry->key, entry->key_len) != 0) { + free(sorted); + return DML_MSGPACK_ERR_INVALID; + } + if (dml_msgpack_pack_value(packer, entry->value) != 0) { + free(sorted); + return DML_MSGPACK_ERR_INVALID; + } + } + free(sorted); + return 0; + } + for (i = 0; i < value->as.map.count; i++) { + DmlMapEntry *entry = &value->as.map.entries[i]; + if (msgpack_pack_str(packer, entry->key_len) != 0) { + return DML_MSGPACK_ERR_INVALID; + } + if (msgpack_pack_str_body(packer, entry->key, entry->key_len) != 0) { + return DML_MSGPACK_ERR_INVALID; + } + if (dml_msgpack_pack_value(packer, entry->value) != 0) { + return DML_MSGPACK_ERR_INVALID; + } + } + return 0; + case DML_VALUE_REF: + if (msgpack_pack_ext(packer, value->as.ref.size, DML_MSGPACK_EXT_REF) != 0) { + return DML_MSGPACK_ERR_INVALID; + } + if (value->as.ref.size == 0) { + return 0; + } + return msgpack_pack_ext_body(packer, value->as.ref.data, value->as.ref.size); + default: + return DML_MSGPACK_ERR_INVALID; + } +} + +int +dml_msgpack_pack(const DmlValue *value, DmlMsgpackBuffer *out_buffer) +{ + msgpack_sbuffer buffer; + msgpack_packer packer; + int rc; + + if (out_buffer == NULL) { + return DML_MSGPACK_ERR_INVALID; + } + + msgpack_sbuffer_init(&buffer); + msgpack_packer_init(&packer, &buffer, msgpack_sbuffer_write); + + rc = dml_msgpack_pack_value(&packer, value); + if (rc != 0) { + msgpack_sbuffer_destroy(&buffer); + return DML_MSGPACK_ERR_INVALID; + } + + out_buffer->data = buffer.data; + out_buffer->size = buffer.size; + buffer.data = NULL; + buffer.size = 0; + buffer.alloc = 0; + msgpack_sbuffer_destroy(&buffer); + return 
DML_MSGPACK_OK; +} + +static DmlValue * +dml_msgpack_from_object(const msgpack_object *obj) +{ + DmlValue *result = NULL; + size_t i; + + switch (obj->type) { + case MSGPACK_OBJECT_NIL: + return dml_value_new_null(); + case MSGPACK_OBJECT_BOOLEAN: + return dml_value_new_bool(obj->via.boolean ? 1 : 0); + case MSGPACK_OBJECT_POSITIVE_INTEGER: + return dml_value_new_int((long long)obj->via.u64); + case MSGPACK_OBJECT_NEGATIVE_INTEGER: + return dml_value_new_int((long long)obj->via.i64); + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: + return dml_value_new_float(obj->via.f64); + case MSGPACK_OBJECT_STR: + return dml_value_new_str(obj->via.str.ptr, obj->via.str.size); + case MSGPACK_OBJECT_EXT: + if (obj->via.ext.type == DML_MSGPACK_EXT_REF) { + return dml_value_new_ref(obj->via.ext.ptr, obj->via.ext.size); + } + return NULL; + case MSGPACK_OBJECT_ARRAY: + result = dml_value_new_list(obj->via.array.size); + if (result == NULL) { + return NULL; + } + for (i = 0; i < obj->via.array.size; i++) { + DmlValue *item = dml_msgpack_from_object(&obj->via.array.ptr[i]); + if (item == NULL || dml_value_list_set(result, i, item) != 0) { + dml_value_free(item); + dml_value_free(result); + return NULL; + } + } + return result; + case MSGPACK_OBJECT_MAP: + result = dml_value_new_map(obj->via.map.size); + if (result == NULL) { + return NULL; + } + for (i = 0; i < obj->via.map.size; i++) { + const msgpack_object_kv *kv = &obj->via.map.ptr[i]; + if (kv->key.type != MSGPACK_OBJECT_STR) { + dml_value_free(result); + return NULL; + } + DmlValue *item = dml_msgpack_from_object(&kv->val); + if (item == NULL || + dml_value_map_set(result, i, kv->key.via.str.ptr, kv->key.via.str.size, item) != 0) { + dml_value_free(item); + dml_value_free(result); + return NULL; + } + } + if (dml_value_map_sort(result) != 0) { + dml_value_free(result); + return NULL; + } + return result; + default: + return NULL; + } +} + +int +dml_msgpack_unpack(const char *data, size_t size, DmlValue 
**out_value) +{ + msgpack_unpacked unpacked; + msgpack_unpack_return ret; + DmlValue *value = NULL; + + if (out_value == NULL) { + return DML_MSGPACK_ERR_INVALID; + } + *out_value = NULL; + + msgpack_unpacked_init(&unpacked); + ret = msgpack_unpack_next(&unpacked, data, size, NULL); + if (ret != MSGPACK_UNPACK_SUCCESS) { + msgpack_unpacked_destroy(&unpacked); + return DML_MSGPACK_ERR_INVALID; + } + + value = dml_msgpack_from_object(&unpacked.data); + msgpack_unpacked_destroy(&unpacked); + if (value == NULL) { + return DML_MSGPACK_ERR_INVALID; + } + + *out_value = value; + return DML_MSGPACK_OK; +} + +void +dml_msgpack_free_buffer(void *data) +{ + free(data); +} diff --git a/c/src/dml_value.c b/c/src/dml_value.c new file mode 100644 index 0000000..7e352f4 --- /dev/null +++ b/c/src/dml_value.c @@ -0,0 +1,278 @@ +#include +#include + +#include "../include/dml_value.h" + +static int +dml_value_map_entry_compare(const void *left, const void *right) +{ + const DmlMapEntry *a = (const DmlMapEntry *)left; + const DmlMapEntry *b = (const DmlMapEntry *)right; + size_t min_len = a->key_len < b->key_len ? a->key_len : b->key_len; + int cmp = 0; + + if (min_len > 0) { + cmp = memcmp(a->key, b->key, min_len); + } + if (cmp != 0) { + return cmp; + } + if (a->key_len < b->key_len) { + return -1; + } + if (a->key_len > b->key_len) { + return 1; + } + return 0; +} + +static DmlValue * +dml_value_alloc(DmlValueType type) +{ + DmlValue *value = (DmlValue *)calloc(1, sizeof(*value)); + if (value == NULL) { + return NULL; + } + value->type = type; + return value; +} + +DmlValue * +dml_value_new_null(void) +{ + return dml_value_alloc(DML_VALUE_NULL); +} + +DmlValue * +dml_value_new_bool(int value) +{ + DmlValue *result = dml_value_alloc(DML_VALUE_BOOL); + if (result == NULL) { + return NULL; + } + result->as.boolean = value ? 
1 : 0; + return result; +} + +DmlValue * +dml_value_new_int(long long value) +{ + DmlValue *result = dml_value_alloc(DML_VALUE_INT); + if (result == NULL) { + return NULL; + } + result->as.integer = value; + return result; +} + +DmlValue * +dml_value_new_float(double value) +{ + DmlValue *result = dml_value_alloc(DML_VALUE_FLOAT); + if (result == NULL) { + return NULL; + } + result->as.floating = value; + return result; +} + +DmlValue * +dml_value_new_str(const char *data, size_t size) +{ + DmlValue *result = dml_value_alloc(DML_VALUE_STR); + if (result == NULL) { + return NULL; + } + result->as.str.data = (char *)malloc(size); + if (result->as.str.data == NULL) { + free(result); + return NULL; + } + if (size > 0 && data != NULL) { + memcpy(result->as.str.data, data, size); + } + result->as.str.size = size; + return result; +} + +DmlValue * +dml_value_new_ref(const char *data, size_t size) +{ + DmlValue *result = dml_value_alloc(DML_VALUE_REF); + if (result == NULL) { + return NULL; + } + result->as.ref.data = (char *)malloc(size); + if (result->as.ref.data == NULL) { + free(result); + return NULL; + } + if (size > 0 && data != NULL) { + memcpy(result->as.ref.data, data, size); + } + result->as.ref.size = size; + return result; +} + +DmlValue * +dml_value_new_list(size_t count) +{ + DmlValue *result = dml_value_alloc(DML_VALUE_LIST); + if (result == NULL) { + return NULL; + } + if (count == 0) { + return result; + } + result->as.list.items = (DmlValue **)calloc(count, sizeof(DmlValue *)); + if (result->as.list.items == NULL) { + free(result); + return NULL; + } + result->as.list.count = count; + return result; +} + +int +dml_value_list_set(DmlValue *list, size_t index, DmlValue *item) +{ + if (list == NULL || list->type != DML_VALUE_LIST) { + return -1; + } + if (index >= list->as.list.count) { + return -1; + } + list->as.list.items[index] = item; + return 0; +} + +DmlValue * +dml_value_new_map(size_t count) +{ + DmlValue *result = dml_value_alloc(DML_VALUE_MAP); + 
if (result == NULL) { + return NULL; + } + if (count == 0) { + return result; + } + result->as.map.entries = (DmlMapEntry *)calloc(count, sizeof(DmlMapEntry)); + if (result->as.map.entries == NULL) { + free(result); + return NULL; + } + result->as.map.count = count; + return result; +} + +int +dml_value_map_set(DmlValue *map, size_t index, const char *key, size_t key_len, DmlValue *value) +{ + char *key_copy = NULL; + + if (map == NULL || map->type != DML_VALUE_MAP) { + return -1; + } + if (index >= map->as.map.count) { + return -1; + } + + key_copy = (char *)malloc(key_len); + if (key_copy == NULL) { + return -1; + } + if (key_len > 0 && key != NULL) { + memcpy(key_copy, key, key_len); + } + + map->as.map.entries[index].key = key_copy; + map->as.map.entries[index].key_len = key_len; + map->as.map.entries[index].value = value; + return 0; +} + +int +dml_value_map_sort(DmlValue *map) +{ + if (map == NULL || map->type != DML_VALUE_MAP) { + return -1; + } + if (map->as.map.count < 2) { + return 0; + } + qsort(map->as.map.entries, map->as.map.count, sizeof(DmlMapEntry), dml_value_map_entry_compare); + return 0; +} + +void +dml_value_free(DmlValue *value) +{ + size_t i; + + if (value == NULL) { + return; + } + + switch (value->type) { + case DML_VALUE_STR: + free(value->as.str.data); + break; + case DML_VALUE_LIST: + for (i = 0; i < value->as.list.count; i++) { + dml_value_free(value->as.list.items[i]); + } + free(value->as.list.items); + break; + case DML_VALUE_MAP: + for (i = 0; i < value->as.map.count; i++) { + free(value->as.map.entries[i].key); + dml_value_free(value->as.map.entries[i].value); + } + free(value->as.map.entries); + break; + case DML_VALUE_REF: + free(value->as.ref.data); + break; + default: + break; + } + + free(value); +} + +int +dml_ref_split( + const char *ref, + size_t ref_len, + const char **namespace_str, + size_t *namespace_len, + const char **id_str, + size_t *id_len +) +{ + const char *slash = NULL; + size_t ns_len = 0; + size_t id_size = 0; 
+ + if (ref == NULL || ref_len == 0 || namespace_str == NULL || namespace_len == NULL || id_str == NULL || + id_len == NULL) { + return -1; + } + + slash = memchr(ref, ':', ref_len); + if (slash == NULL || slash == ref) { + return -1; + } + + ns_len = (size_t)(slash - ref); + id_size = ref_len - ns_len - 1; + if (id_size > DML_REF_ID_MAX) { + return -1; + } + + *namespace_str = ref; + *namespace_len = ns_len; + *id_str = slash + 1; + *id_len = id_size; + return 0; +} diff --git a/c/third_party/lmdb/README.md b/c/third_party/lmdb/README.md new file mode 100644 index 0000000..1e72804 --- /dev/null +++ b/c/third_party/lmdb/README.md @@ -0,0 +1,18 @@ +# LMDB + +We vendored LMDB into `c/third_party/lmdb` from the canonical upstream release: + +- Source: https://github.com/LMDB/lmdb +- Release tag: `LMDB_0.9.31` + +Steps used: + +1. Download the release tarball for `LMDB_0.9.31`. +2. Extract only `libraries/liblmdb`. +3. Prune the directory to the minimal build inputs and licenses: + - `mdb.c`, `midl.c` + - `lmdb.h`, `midl.h` + - `LICENSE`, `COPYRIGHT` +4. Remove the tarball after extraction. + +If we update LMDB, repeat the same steps and keep the version pinned in this section. diff --git a/c/third_party/lmdb/libraries/liblmdb/COPYRIGHT b/c/third_party/lmdb/libraries/liblmdb/COPYRIGHT new file mode 100644 index 0000000..14eb149 --- /dev/null +++ b/c/third_party/lmdb/libraries/liblmdb/COPYRIGHT @@ -0,0 +1,20 @@ +Copyright 2011-2021 Howard Chu, Symas Corp. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted only as authorized by the OpenLDAP +Public License. + +A copy of this license is available in the file LICENSE in the +top-level directory of the distribution or, alternatively, at +. + +OpenLDAP is a registered trademark of the OpenLDAP Foundation. + +Individual files and/or contributed packages may be copyright by +other parties and/or subject to additional restrictions. 
+ +This work also contains materials derived from public sources. + +Additional information about OpenLDAP can be obtained at +<http://www.OpenLDAP.org/>. diff --git a/c/third_party/lmdb/libraries/liblmdb/LICENSE b/c/third_party/lmdb/libraries/liblmdb/LICENSE new file mode 100644 index 0000000..05ad757 --- /dev/null +++ b/c/third_party/lmdb/libraries/liblmdb/LICENSE @@ -0,0 +1,47 @@ +The OpenLDAP Public License + Version 2.8, 17 August 2003 + +Redistribution and use of this software and associated documentation +("Software"), with or without modification, are permitted provided +that the following conditions are met: + +1. Redistributions in source form must retain copyright statements + and notices, + +2. Redistributions in binary form must reproduce applicable copyright + statements and notices, this list of conditions, and the following + disclaimer in the documentation and/or other materials provided + with the distribution, and + +3. Redistributions must contain a verbatim copy of this document. + +The OpenLDAP Foundation may revise this license from time to time. +Each revision is distinguished by a version number. You may use +this Software under terms of this license revision or under the +terms of any subsequent revision of the license. + +THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS +CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT +SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S) +OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +The names of the authors and copyright holders must not be used in +advertising or otherwise to promote the sale, use or other dealing +in this Software without specific, written prior permission. Title +to copyright in this Software shall at all times remain with copyright +holders. + +OpenLDAP is a registered trademark of the OpenLDAP Foundation. + +Copyright 1999-2003 The OpenLDAP Foundation, Redwood City, +California, USA. All Rights Reserved. Permission to copy and +distribute verbatim copies of this document is granted. diff --git a/c/third_party/lmdb/libraries/liblmdb/lmdb.h b/c/third_party/lmdb/libraries/liblmdb/lmdb.h new file mode 100644 index 0000000..ff03c22 --- /dev/null +++ b/c/third_party/lmdb/libraries/liblmdb/lmdb.h @@ -0,0 +1,1608 @@ +/** @file lmdb.h + * @brief Lightning memory-mapped database library + * + * @mainpage Lightning Memory-Mapped Database Manager (LMDB) + * + * @section intro_sec Introduction + * LMDB is a Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. The entire database is exposed + * in a memory map, and all data fetches return data directly + * from the mapped memory, so no malloc's or memcpy's occur during + * data fetches. As such, the library is extremely simple because it + * requires no page caching layer of its own, and it is extremely high + * performance and memory-efficient. 
It is also fully transactional with + * full ACID semantics, and when the memory map is read-only, the + * database integrity cannot be corrupted by stray pointer writes from + * application code. + * + * The library is fully thread-aware and supports concurrent read/write + * access from multiple processes and threads. Data pages use a copy-on- + * write strategy so no active data pages are ever overwritten, which + * also provides resistance to corruption and eliminates the need of any + * special recovery procedures after a system crash. Writes are fully + * serialized; only one write transaction may be active at a time, which + * guarantees that writers can never deadlock. The database structure is + * multi-versioned so readers run with no locks; writers cannot block + * readers, and readers don't block writers. + * + * Unlike other well-known database mechanisms which use either write-ahead + * transaction logs or append-only data writes, LMDB requires no maintenance + * during operation. Both write-ahead loggers and append-only databases + * require periodic checkpointing and/or compaction of their log or database + * files otherwise they grow without bound. LMDB tracks free pages within + * the database and re-uses them for new write operations, so the database + * size does not grow without bound in normal use. + * + * The memory map can be used as a read-only or read-write map. It is + * read-only by default as this provides total immunity to corruption. + * Using read-write mode offers much higher write performance, but adds + * the possibility for stray application writes thru pointers to silently + * corrupt the database. Of course if your application code is known to + * be bug-free (...) then this is not an issue. + * + * If this is your first time using a transactional embedded key/value + * store, you may find the \ref starting page to be helpful. 
+ * + * @section caveats_sec Caveats + * Troubleshooting the lock file, plus semaphores on BSD systems: + * + * - A broken lockfile can cause sync issues. + * Stale reader transactions left behind by an aborted program + * cause further writes to grow the database quickly, and + * stale locks can block further operation. + * + * Fix: Check for stale readers periodically, using the + * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. + * Stale writers will be cleared automatically on some systems: + * - Windows - automatic + * - Linux, systems using POSIX mutexes with Robust option - automatic + * - not on BSD, systems using POSIX semaphores. + * Otherwise just make all programs using the database close it; + * the lockfile is always reset on first open of the environment. + * + * - On BSD systems or others configured with MDB_USE_POSIX_SEM, + * startup can fail due to semaphores owned by another userid. + * + * Fix: Open and close the database as the user which owns the + * semaphores (likely last user) or as root, while no other + * process is using the database. + * + * Restrictions/caveats (in addition to those listed for some functions): + * + * - Only the database owner should normally use the database on + * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM. + * Multiple users can cause startup to fail later, as noted above. + * + * - There is normally no pure read-only mode, since readers need write + * access to locks and lock file. Exceptions: On read-only filesystems + * or with the #MDB_NOLOCK flag described under #mdb_env_open(). + * + * - An LMDB configuration will often reserve considerable \b unused + * memory address space and maybe file size for future growth. + * This does not use actual memory or disk space, but users may need + * to understand the difference so they won't be scared off. 
+ * + * - By default, in versions before 0.9.10, unused portions of the data + * file might receive garbage data from memory freed by other code. + * (This does not happen when using the #MDB_WRITEMAP flag.) As of + * 0.9.10 the default behavior is to initialize such memory before + * writing to the data file. Since there may be a slight performance + * cost due to this initialization, applications may disable it using + * the #MDB_NOMEMINIT flag. Applications handling sensitive data + * which must not be written should not use this flag. This flag is + * irrelevant when using #MDB_WRITEMAP. + * + * - A thread can only use one transaction at a time, plus any child + * transactions. Each transaction belongs to one thread. See below. + * The #MDB_NOTLS flag changes this for read-only transactions. + * + * - Use an MDB_env* in the process which opened it, not after fork(). + * + * - Do not have open an LMDB database twice in the same process at + * the same time. Not even from a plain open() call - close()ing it + * breaks fcntl() advisory locking. (It is OK to reopen it after + * fork() - exec*(), since the lockfile has FD_CLOEXEC set.) + * + * - Avoid long-lived transactions. Read transactions prevent + * reuse of pages freed by newer write transactions, thus the + * database can grow quickly. Write transactions prevent + * other write transactions, since writes are serialized. + * + * - Avoid suspending a process with active transactions. These + * would then be "long-lived" as above. Also read transactions + * suspended when writers commit could sometimes see wrong data. + * + * ...when several processes can use a database concurrently: + * + * - Avoid aborting a process with an active transaction. + * The transaction becomes "long-lived" as above until a check + * for stale readers is performed or the lockfile is reset, + * since the process may not remove it from the lockfile. 
+ * + * This does not apply to write transactions if the system clears + * stale writers, see above. + * + * - If you do that anyway, do a periodic check for stale readers. Or + * close the environment once in a while, so the lockfile can get reset. + * + * - Do not use LMDB databases on remote filesystems, even between + * processes on the same host. This breaks flock() on some OSes, + * possibly memory map sync, and certainly sync between programs + * on different hosts. + * + * - Opening a database can fail if another process is opening or + * closing it at exactly the same time. + * + * @author Howard Chu, Symas Corporation. + * + * @copyright Copyright 2011-2021 Howard Chu, Symas Corp. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + * + * @par Derived From: + * This code is derived from btree.c written by Martin Hedenfalk. + * + * Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */ +#ifndef _LMDB_H_ +#define _LMDB_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** Unix permissions for creating files, or dummy definition for Windows */ +#ifdef _MSC_VER +typedef int mdb_mode_t; +#else +typedef mode_t mdb_mode_t; +#endif + +/** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#ifdef _WIN32 +typedef void *mdb_filehandle_t; +#else +typedef int mdb_filehandle_t; +#endif + +/** @defgroup mdb LMDB API + * @{ + * @brief OpenLDAP Lightning Memory-Mapped Database Manager + */ +/** @defgroup Version Version Macros + * @{ + */ +/** Library major version */ +#define MDB_VERSION_MAJOR 0 +/** Library minor version */ +#define MDB_VERSION_MINOR 9 +/** Library patch version */ +#define MDB_VERSION_PATCH 31 + +/** Combine args a,b,c into a single integer for easy version comparisons */ +#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) + +/** The full library version as a single integer */ +#define MDB_VERSION_FULL \ + MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) + +/** The release date of this library version */ +#define MDB_VERSION_DATE "July 10, 2023" + +/** A stringifier for the version info */ +#define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" + +/** A helper for the stringifier macro */ +#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) + +/** The full library version as a C string */ +#define MDB_VERSION_STRING \ + MDB_VERFOO(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH,MDB_VERSION_DATE) +/** @} */ + +/** @brief Opaque structure for a database environment. + * + * A DB environment supports multiple databases, all residing in the same + * shared-memory map. + */ +typedef struct MDB_env MDB_env; + +/** @brief Opaque structure for a transaction handle. + * + * All database operations require a transaction handle. Transactions may be + * read-only or read-write. 
+ */ +typedef struct MDB_txn MDB_txn; + +/** @brief A handle for an individual database in the DB environment. */ +typedef unsigned int MDB_dbi; + +/** @brief Opaque structure for navigating through a database */ +typedef struct MDB_cursor MDB_cursor; + +/** @brief Generic structure used for passing keys and data in and out + * of the database. + * + * Values returned from the database are valid only until a subsequent + * update operation, or the end of the transaction. Do not modify or + * free them, they commonly point into the database itself. + * + * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive. + * The same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. + */ +typedef struct MDB_val { + size_t mv_size; /**< size of the data item */ + void *mv_data; /**< address of the data item */ +} MDB_val; + +/** @brief A callback function used to compare two keys in a database */ +typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); + +/** @brief A callback function used to relocate a position-dependent data item + * in a fixed-address database. + * + * The \b newptr gives the item's desired address in + * the memory map, and \b oldptr gives its previous address. The item's actual + * data resides at the address in \b item. This callback is expected to walk + * through the fields of the record in \b item and modify any + * values based at the \b oldptr address to be relative to the \b newptr address. + * @param[in,out] item The item that is to be relocated. + * @param[in] oldptr The previous address. + * @param[in] newptr The new address to relocate to. + * @param[in] relctx An application-provided context, set by #mdb_set_relctx(). + * @todo This feature is currently unimplemented. 
+ */ +typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); + +/** @defgroup mdb_env Environment Flags + * @{ + */ + /** mmap at a fixed address (experimental) */ +#define MDB_FIXEDMAP 0x01 + /** no environment directory */ +#define MDB_NOSUBDIR 0x4000 + /** don't fsync after commit */ +#define MDB_NOSYNC 0x10000 + /** read only */ +#define MDB_RDONLY 0x20000 + /** don't fsync metapage after commit */ +#define MDB_NOMETASYNC 0x40000 + /** use writable mmap */ +#define MDB_WRITEMAP 0x80000 + /** use asynchronous msync when #MDB_WRITEMAP is used */ +#define MDB_MAPASYNC 0x100000 + /** tie reader locktable slots to #MDB_txn objects instead of to threads */ +#define MDB_NOTLS 0x200000 + /** don't do any locking, caller must manage their own locks */ +#define MDB_NOLOCK 0x400000 + /** don't do readahead (no effect on Windows) */ +#define MDB_NORDAHEAD 0x800000 + /** don't initialize malloc'd memory before writing to datafile */ +#define MDB_NOMEMINIT 0x1000000 +/** @} */ + +/** @defgroup mdb_dbi_open Database Flags + * @{ + */ + /** use reverse string keys */ +#define MDB_REVERSEKEY 0x02 + /** use sorted duplicates */ +#define MDB_DUPSORT 0x04 + /** numeric keys in native byte order: either unsigned int or size_t. + * The keys must all be of the same size. */ +#define MDB_INTEGERKEY 0x08 + /** with #MDB_DUPSORT, sorted dup items have fixed size */ +#define MDB_DUPFIXED 0x10 + /** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ +#define MDB_INTEGERDUP 0x20 + /** with #MDB_DUPSORT, use reverse string dups */ +#define MDB_REVERSEDUP 0x40 + /** create DB if not already existing */ +#define MDB_CREATE 0x40000 +/** @} */ + +/** @defgroup mdb_put Write Flags + * @{ + */ +/** For put: Don't write if the key already exists. */ +#define MDB_NOOVERWRITE 0x10 +/** Only for #MDB_DUPSORT
+ * For put: don't write if the key and data pair already exist.
+ * For mdb_cursor_del: remove all duplicate data items. + */ +#define MDB_NODUPDATA 0x20 +/** For mdb_cursor_put: overwrite the current key/data pair */ +#define MDB_CURRENT 0x40 +/** For put: Just reserve space for data, don't copy it. Return a + * pointer to the reserved space. + */ +#define MDB_RESERVE 0x10000 +/** Data is being appended, don't split full pages. */ +#define MDB_APPEND 0x20000 +/** Duplicate data is being appended, don't split full pages. */ +#define MDB_APPENDDUP 0x40000 +/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ +#define MDB_MULTIPLE 0x80000 +/* @} */ + +/** @defgroup mdb_copy Copy Flags + * @{ + */ +/** Compacting copy: Omit free space from copy, and renumber all + * pages sequentially. + */ +#define MDB_CP_COMPACT 0x01 +/* @} */ + +/** @brief Cursor Get operations. + * + * This is the set of all operations for retrieving data + * using a cursor. + */ +typedef enum MDB_cursor_op { + MDB_FIRST, /**< Position at first key/data item */ + MDB_FIRST_DUP, /**< Position at first data item of current key. + Only for #MDB_DUPSORT */ + MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ + MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ + MDB_GET_CURRENT, /**< Return key/data at current cursor position */ + MDB_GET_MULTIPLE, /**< Return up to a page of duplicate data items + from current cursor position. Move cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ + MDB_LAST, /**< Position at last key/data item */ + MDB_LAST_DUP, /**< Position at last data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT, /**< Position at next data item */ + MDB_NEXT_DUP, /**< Position at next data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT_MULTIPLE, /**< Return up to a page of duplicate data items + from next cursor position. Move cursor to prepare + for #MDB_NEXT_MULTIPLE. 
Only for #MDB_DUPFIXED */ + MDB_NEXT_NODUP, /**< Position at first data item of next key */ + MDB_PREV, /**< Position at previous data item */ + MDB_PREV_DUP, /**< Position at previous data item of current key. + Only for #MDB_DUPSORT */ + MDB_PREV_NODUP, /**< Position at last data item of previous key */ + MDB_SET, /**< Position at specified key */ + MDB_SET_KEY, /**< Position at specified key, return key + data */ + MDB_SET_RANGE, /**< Position at first key greater than or equal to specified key. */ + MDB_PREV_MULTIPLE /**< Position at previous page and return up to + a page of duplicate data items. Only for #MDB_DUPFIXED */ +} MDB_cursor_op; + +/** @defgroup errors Return Codes + * + * BerkeleyDB uses -30800 to -30999, we'll go under them + * @{ + */ + /** Successful result */ +#define MDB_SUCCESS 0 + /** key/data pair already exists */ +#define MDB_KEYEXIST (-30799) + /** key/data pair not found (EOF) */ +#define MDB_NOTFOUND (-30798) + /** Requested page not found - this usually indicates corruption */ +#define MDB_PAGE_NOTFOUND (-30797) + /** Located page was wrong type */ +#define MDB_CORRUPTED (-30796) + /** Update of meta page failed or environment had fatal error */ +#define MDB_PANIC (-30795) + /** Environment version mismatch */ +#define MDB_VERSION_MISMATCH (-30794) + /** File is not a valid LMDB file */ +#define MDB_INVALID (-30793) + /** Environment mapsize reached */ +#define MDB_MAP_FULL (-30792) + /** Environment maxdbs reached */ +#define MDB_DBS_FULL (-30791) + /** Environment maxreaders reached */ +#define MDB_READERS_FULL (-30790) + /** Too many TLS keys in use - Windows only */ +#define MDB_TLS_FULL (-30789) + /** Txn has too many dirty pages */ +#define MDB_TXN_FULL (-30788) + /** Cursor stack too deep - internal error */ +#define MDB_CURSOR_FULL (-30787) + /** Page has not enough space - internal error */ +#define MDB_PAGE_FULL (-30786) + /** Database contents grew beyond environment mapsize */ +#define MDB_MAP_RESIZED (-30785) + /** 
Operation and DB incompatible, or DB type changed. This can mean: + *
    + *
  • The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. + *
  • Opening a named DB when the unnamed DB has #MDB_DUPSORT / #MDB_INTEGERKEY. + *
  • Accessing a data record as a database, or vice versa. + *
  • The database was dropped and recreated with different flags. + *
+ */ +#define MDB_INCOMPATIBLE (-30784) + /** Invalid reuse of reader locktable slot */ +#define MDB_BAD_RSLOT (-30783) + /** Transaction must abort, has a child, or is invalid */ +#define MDB_BAD_TXN (-30782) + /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ +#define MDB_BAD_VALSIZE (-30781) + /** The specified DBI was changed unexpectedly */ +#define MDB_BAD_DBI (-30780) + /** The last defined error code */ +#define MDB_LAST_ERRCODE MDB_BAD_DBI +/** @} */ + +/** @brief Statistics for a database in the environment */ +typedef struct MDB_stat { + unsigned int ms_psize; /**< Size of a database page. + This is currently the same for all databases. */ + unsigned int ms_depth; /**< Depth (height) of the B-tree */ + size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ + size_t ms_leaf_pages; /**< Number of leaf pages */ + size_t ms_overflow_pages; /**< Number of overflow pages */ + size_t ms_entries; /**< Number of data items */ +} MDB_stat; + +/** @brief Information about the environment */ +typedef struct MDB_envinfo { + void *me_mapaddr; /**< Address of map, if fixed */ + size_t me_mapsize; /**< Size of the data memory map */ + size_t me_last_pgno; /**< ID of the last used page */ + size_t me_last_txnid; /**< ID of the last committed transaction */ + unsigned int me_maxreaders; /**< max reader slots in the environment */ + unsigned int me_numreaders; /**< max reader slots used in the environment */ +} MDB_envinfo; + + /** @brief Return the LMDB library version information. + * + * @param[out] major if non-NULL, the library major version number is copied here + * @param[out] minor if non-NULL, the library minor version number is copied here + * @param[out] patch if non-NULL, the library patch version number is copied here + * @retval "version string" The library version as a string + */ +char *mdb_version(int *major, int *minor, int *patch); + + /** @brief Return a string describing a given error code. 
+ * + * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) + * function. If the error code is greater than or equal to 0, then the string + * returned by the system function strerror(3) is returned. If the error code + * is less than 0, an error string corresponding to the LMDB library error is + * returned. See @ref errors for a list of LMDB-specific error codes. + * @param[in] err The error code + * @retval "error message" The description of the error + */ +char *mdb_strerror(int err); + + /** @brief Create an LMDB environment handle. + * + * This function allocates memory for a #MDB_env structure. To release + * the allocated memory and discard the handle, call #mdb_env_close(). + * Before the handle may be used, it must be opened using #mdb_env_open(). + * Various other options may also need to be set before opening the handle, + * e.g. #mdb_env_set_mapsize(), #mdb_env_set_maxreaders(), #mdb_env_set_maxdbs(), + * depending on usage requirements. + * @param[out] env The address where the new handle will be stored + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_create(MDB_env **env); + + /** @brief Open an environment handle. + * + * If this function fails, #mdb_env_close() must be called to discard the #MDB_env handle. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] path The directory in which the database files reside. This + * directory must already exist and be writable. + * @param[in] flags Special options for this environment. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * Flags set by mdb_env_set_flags() are also used. + *
    + *
  • #MDB_FIXEDMAP + * Use a fixed address for the mmap region. This flag must be specified + * when creating the environment, and is stored persistently in the environment. + * If successful, the memory map will always reside at the same virtual address + * and pointers used to reference data items in the database will be constant + * across multiple invocations. This option may not always work, depending on + * how the operating system has allocated memory to shared libraries and other uses. + * The feature is highly experimental. + *
  • #MDB_NOSUBDIR + * By default, LMDB creates its environment in a directory whose + * pathname is given in \b path, and creates its data and lock files + * under that directory. With this option, \b path is used as-is for + * the database main data file. The database lock file is the \b path + * with "-lock" appended. + *
  • #MDB_RDONLY + * Open the environment in read-only mode. No write operations will be + * allowed. LMDB will still modify the lock file - except on read-only + * filesystems, where LMDB does not use locks. + *
  • #MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This uses + * fewer mallocs but loses protection from application bugs + * like wild pointer writes and other bad updates into the database. + * This may be slightly faster for DBs that fit entirely in RAM, but + * is slower for DBs larger than RAM. + * Incompatible with nested transactions. + * Do not mix processes with and without MDB_WRITEMAP on the same + * environment. This can defeat durability (#mdb_env_sync etc). + *
  • #MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit the + * metadata flush. Defer that until the system flushes files to disk, + * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization + * maintains database integrity, but a system crash may undo the last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database property. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOSYNC + * Don't flush system buffers to disk when committing a transaction. + * This optimization means a system crash can corrupt the database or + * lose the last transactions if buffers are not yet flushed to disk. + * The risk is governed by how often the system flushes dirty buffers + * to disk and how often #mdb_env_sync() is called. However, if the + * filesystem preserves write order and the #MDB_WRITEMAP flag is not + * used, transactions exhibit ACI (atomicity, consistency, isolation) + * properties and only lose D (durability). I.e. database integrity + * is maintained, but a system crash may undo the final transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk, unless #mdb_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. + * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling #mdb_env_sync() + * ensures on-disk database integrity until next commit. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps + * the slot reserved for the #MDB_txn object. A thread may use parallel + * read-only transactions. A read-only transaction may span threads if + * the user synchronizes its use. Applications that multiplex many + * user threads over individual OS threads need this option. Such an + * application must also serialize the write transactions in an OS + * thread, since LMDB's write locking is unaware of the user threads. + *
  • #MDB_NOLOCK + * Don't do any locking. If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper operation + * the caller must enforce single-writer semantics, and must ensure + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so that + * no readers may be active at all when a writer begins. + *
  • #MDB_NORDAHEAD + * Turn off readahead. Most operating systems perform readahead on + * read requests by default. This option turns it off if the OS + * supports it. Turning it off may help random read performance + * when the DB is larger than RAM and system RAM is full. + * The option is not implemented on Windows. + *
  • #MDB_NOMEMINIT + * Don't initialize malloc'd memory before writing to unused spaces + * in the data file. By default, memory for pages written to the data + * file is obtained using malloc. While these pages may be reused in + * subsequent transactions, freshly malloc'd pages will be initialized + * to zeroes before use. This avoids persisting leftover data from other + * code (that used the heap and subsequently freed the memory) into the + * data file. Note that many other system libraries may allocate + * and free memory from the heap for arbitrary uses. E.g., stdio may + * use the heap for file I/O buffers. This initialization step has a + * modest performance cost so some applications may want to disable + * it using this flag. This option can be a problem for applications + * which handle sensitive data like passwords, and it makes memory + * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP, + * which writes directly to the mmap instead of using malloc for pages. The + * initialization is also skipped if #MDB_RESERVE is used; the + * caller is expected to overwrite all of the memory that was + * reserved in that case. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
+ * @param[in] mode The UNIX permissions to set on created files and semaphores. + * This parameter is ignored on Windows. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the + * version that created the database environment. + *
  • #MDB_INVALID - the environment file headers are corrupted. + *
  • ENOENT - the directory specified by the path parameter doesn't exist. + *
  • EACCES - the user didn't have permission to access the environment files. + *
  • EAGAIN - the environment was locked by another process. + *
+ */ +int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode); + + /** @brief Copy an LMDB environment to the specified path. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy(MDB_env *env, const char *path); + + /** @brief Copy an LMDB environment to the specified file descriptor. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); + + /** @brief Copy an LMDB environment to the specified path, with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. 
+ * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free + * pages and sequentially renumber all pages in output. This option + * consumes more CPU and runs more slowly than the default. + * Currently it fails if the environment has suffered a page leak. + *
+ * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags); + + /** @brief Copy an LMDB environment to the specified file descriptor, + * with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. See + * #mdb_env_copy2() for further details. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @param[in] flags Special options for this operation. + * See #mdb_env_copy2() for options. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags); + + /** @brief Return statistics about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + */ +int mdb_env_stat(MDB_env *env, MDB_stat *stat); + + /** @brief Return information about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_envinfo structure + * where the information will be copied + */ +int mdb_env_info(MDB_env *env, MDB_envinfo *stat); + + /** @brief Flush the data buffers to disk. + * + * Data is always written to disk when #mdb_txn_commit() is called, + * but the operating system may keep it buffered. 
LMDB always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is + * not valid if the environment was opened with #MDB_RDONLY. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] force If non-zero, force a synchronous flush. Otherwise + * if the environment has the #MDB_NOSYNC flag set the flushes + * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - the environment is read-only. + *
  • EINVAL - an invalid parameter was specified. + *
  • EIO - an error occurred during synchronization. + *
+ */ +int mdb_env_sync(MDB_env *env, int force); + + /** @brief Close the environment and release the memory map. + * + * Only a single thread may call this function. All transactions, databases, + * and cursors must already be closed before calling this function. Attempts to + * use any such handles after calling this function will cause a SIGSEGV. + * The environment handle will be freed and must not be used again after this call. + * @param[in] env An environment handle returned by #mdb_env_create() + */ +void mdb_env_close(MDB_env *env); + + /** @brief Set environment flags. + * + * This may be used to set some flags in addition to those from + * #mdb_env_open(), or to unset these flags. If several threads + * change the flags at the same time, the result is undefined. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] flags The flags to change, bitwise OR'ed together + * @param[in] onoff A non-zero value sets the flags, zero clears them. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_set_flags(MDB_env *env, unsigned int flags, int onoff); + + /** @brief Get environment flags. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] flags The address of an integer to store the flags + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_flags(MDB_env *env, unsigned int *flags); + + /** @brief Return the path that was used in #mdb_env_open(). + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] path Address of a string pointer to contain the path. This + * is the actual string in the environment, not a copy. It should not be + * altered in any way. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_path(MDB_env *env, const char **path); + + /** @brief Return the filedescriptor for the given environment. + * + * This function may be called after fork(), so the descriptor can be + * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. + * (Until LMDB 0.9.18, only the lockfile had that.) + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); + + /** @brief Set the size of the memory map to use for this environment. + * + * The size should be a multiple of the OS page size. The default is + * 10485760 bytes. The size of the memory map is also the maximum size + * of the database. The value should be chosen as large as possible, + * to accommodate future growth of the database. + * This function should be called after #mdb_env_create() and before #mdb_env_open(). + * It may be called at later times if no transactions are active in + * this process. Note that the library does not check for this condition, + * the caller must ensure it explicitly. + * + * The new size takes effect immediately for the current process but + * will not be persisted to any others until a write transaction has been + * committed by the current process. Also, only mapsize increases are + * persisted into the environment. + * + * If the mapsize is increased by another process, and data has grown + * beyond the range of the current mapsize, #mdb_txn_begin() will + * return #MDB_MAP_RESIZED. This function may be called with a size + * of zero to adopt the new size. + * + * Any attempt to set a size smaller than the space already consumed + * by the environment will be silently changed to the current size of the used space. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] size The size in bytes + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment has + * an active write transaction. + *
+ */ +int mdb_env_set_mapsize(MDB_env *env, size_t size); + + /** @brief Set the maximum number of threads/reader slots for the environment. + * + * This defines the number of slots in the lock table that is used to track readers in + * the environment. The default is 126. + * Starting a read-only transaction normally ties a lock table slot to the + * current thread until the environment closes or the thread exits. If + * MDB_NOTLS is in use, #mdb_txn_begin() instead ties the slot to the + * MDB_txn object until it or the #MDB_env object is destroyed. + * This function may only be called after #mdb_env_create() and before #mdb_env_open(). + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] readers The maximum number of reader lock table slots + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is already open. + *
+ */ +int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers); + + /** @brief Get the maximum number of threads/reader slots for the environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] readers Address of an integer to store the number of readers + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers); + + /** @brief Set the maximum number of named databases for the environment. + * + * This function is only needed if multiple databases will be used in the + * environment. Simpler applications that use the environment as a single + * unnamed database can ignore this option. + * This function may only be called after #mdb_env_create() and before #mdb_env_open(). + * + * Currently a moderate number of slots are cheap but a huge number gets + * expensive: 7-120 words per transaction, and every #mdb_dbi_open() + * does a linear search of the opened slots. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] dbs The maximum number of databases + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is already open. + *
+ */ +int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); + + /** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. + * + * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. + * See @ref MDB_val. + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The maximum size of a key we can write + */ +int mdb_env_get_maxkeysize(MDB_env *env); + + /** @brief Set application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_userctx(MDB_env *env, void *ctx); + + /** @brief Get the application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The pointer set by #mdb_env_set_userctx(). + */ +void *mdb_env_get_userctx(MDB_env *env); + + /** @brief A callback function for most LMDB assert() failures, + * called before printing the message and aborting. + * + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] msg The assertion message, not including newline. + */ +typedef void MDB_assert_func(MDB_env *env, const char *msg); + + /** Set or reset the assert() callback of the environment. + * Disabled if liblmdb is built with NDEBUG. + * @note This hack should become obsolete as lmdb's error handling matures. + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] func An #MDB_assert_func function, or 0. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); + + /** @brief Create a transaction for use with the environment. + * + * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). 
+ * @note A transaction and its cursors must only be used by a single + * thread, and a thread may only have a single transaction at a time. + * If #MDB_NOTLS is in use, this does not apply to read-only transactions. + * @note Cursors may not span transactions. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] parent If this parameter is non-NULL, the new transaction + * will be a nested transaction, with the transaction indicated by \b parent + * as its parent. Transactions may be nested to any level. A parent + * transaction and its cursors may not issue any other operations than + * mdb_txn_commit and mdb_txn_abort while it has active child transactions. + * @param[in] flags Special options for this transaction. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_RDONLY + * This transaction will not perform any write operations. + *
+ * @param[out] txn Address where the new #MDB_txn handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • #MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's + * mapsize and this environment's map must be resized as well. + * See #mdb_env_set_mapsize(). + *
  • #MDB_READERS_FULL - a read-only transaction was requested and + * the reader lock table is full. See #mdb_env_set_maxreaders(). + *
  • ENOMEM - out of memory. + *
+ */ +int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn); + + /** @brief Returns the transaction's #MDB_env + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +MDB_env *mdb_txn_env(MDB_txn *txn); + + /** @brief Return the transaction's ID. + * + * This returns the identifier associated with this transaction. For a + * read-only transaction, this corresponds to the snapshot being read; + * concurrent readers will frequently have the same transaction ID. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A transaction ID, valid if input is an active transaction. + */ +size_t mdb_txn_id(MDB_txn *txn); + + /** @brief Commit all the operations of a transaction into the database. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdb_cursor_renew(). + * @note Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
  • ENOSPC - no more disk space. + *
  • EIO - a low-level I/O error occurred while writing. + *
  • ENOMEM - out of memory. + *
+ */ +int mdb_txn_commit(MDB_txn *txn); + + /** @brief Abandon all the operations of the transaction instead of saving them. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdb_cursor_renew(). + * @note Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +void mdb_txn_abort(MDB_txn *txn); + + /** @brief Reset a read-only transaction. + * + * Abort the transaction like #mdb_txn_abort(), but keep the transaction + * handle. #mdb_txn_renew() may reuse the handle. This saves allocation + * overhead if the process will start a new read-only transaction soon, + * and also locking overhead if #MDB_NOTLS is in use. The reader table + * lock is released, but the table slot stays tied to its thread or + * #MDB_txn. Use mdb_txn_abort() to discard a reset handle, and to free + * its lock table slot if MDB_NOTLS is in use. + * Cursors opened within the transaction must not be used + * again after this call, except with #mdb_cursor_renew(). + * Reader locks generally don't interfere with writers, but they keep old + * versions of database pages allocated. Thus they prevent the old pages + * from being reused when writers commit new data, and so under heavy load + * the database size may grow much more rapidly than otherwise. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +void mdb_txn_reset(MDB_txn *txn); + + /** @brief Renew a read-only transaction. + * + * This acquires a new reader lock for a transaction handle that had been + * released by #mdb_txn_reset(). It must be called before a reset transaction + * may be used again. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_txn_renew(MDB_txn *txn); + +/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ +#define mdb_open(txn,name,flags,dbi) mdb_dbi_open(txn,name,flags,dbi) +/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ +#define mdb_close(env,dbi) mdb_dbi_close(env,dbi) + + /** @brief Open a database in the environment. + * + * A database handle denotes the name and parameters of a database, + * independently of whether such a database exists. + * The database handle may be discarded by calling #mdb_dbi_close(). + * The old database handle is returned if the database was already open. + * The handle may only be closed once. + * + * The database handle will be private to the current transaction until + * the transaction is successfully committed. If the transaction is + * aborted the handle will be closed automatically. + * After a successful commit the handle will reside in the shared + * environment, and may be used by other transactions. + * + * This function must not be called from multiple concurrent + * transactions in the same process. A transaction that uses + * this function must finish (either commit or abort) before + * any other transaction in the process may use this function. + * + * To use named databases (with name != NULL), #mdb_env_set_maxdbs() + * must be called before opening the environment. Database names are + * keys in the unnamed database, and may be read but not written. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] name The name of the database to open. If only a single + * database is needed in the environment, this value may be NULL. + * @param[in] flags Special options for this database. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_REVERSEKEY + * Keys are strings to be compared in reverse order, from the end + * of the strings to the beginning. By default, keys are treated as strings and + * compared from beginning to end. + *
  • #MDB_DUPSORT + * Duplicate keys may be used in the database. (Or, from another perspective, + * keys may have multiple data items, stored in sorted order.) By default + * keys must be unique and may have only a single data item. + *
  • #MDB_INTEGERKEY + * Keys are binary integers in native byte order, either unsigned int + * or size_t, and will be sorted as such. + * The keys must all be of the same size. + *
  • #MDB_DUPFIXED + * This flag may only be used in combination with #MDB_DUPSORT. This option + * tells the library that the data items for this database are all the same + * size, which allows further optimizations in storage and retrieval. When + * all data items are the same size, the #MDB_GET_MULTIPLE, #MDB_NEXT_MULTIPLE + * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve multiple + * items at once. + *
  • #MDB_INTEGERDUP + * This option specifies that duplicate data items are binary integers, + * similar to #MDB_INTEGERKEY keys. + *
  • #MDB_REVERSEDUP + * This option specifies that duplicate data items should be compared as + * strings in reverse order. + *
  • #MDB_CREATE + * Create the named database if it doesn't exist. This option is not + * allowed in a read-only transaction or a read-only environment. + *
+ * @param[out] dbi Address where the new #MDB_dbi handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the specified database doesn't exist in the environment + * and #MDB_CREATE was not specified. + *
  • #MDB_DBS_FULL - too many databases have been opened. See #mdb_env_set_maxdbs(). + *
+ */ +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi); + + /** @brief Retrieve statistics for a database. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); + + /** @brief Retrieve the DB flags for a database handle. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] flags Address where the flags will be returned. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags); + + /** @brief Close a database handle. Normally unnecessary. Use with care: + * + * This call is not mutex protected. Handles should only be closed by + * a single thread, and only if no other threads are going to reference + * the database handle or one of its cursors any further. Do not close + * a handle if an existing transaction has modified its database. + * Doing so can cause misbehavior from database corruption to errors + * like MDB_BAD_VALSIZE (since the DB name is gone). + * + * Closing a database handle is not necessary, but lets #mdb_dbi_open() + * reuse the handle value. Usually it's better to set a bigger + * #mdb_env_set_maxdbs(), unless that value would be large. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + */ +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); + + /** @brief Empty or delete+close a database. + * + * See #mdb_dbi_close() for restrictions about closing the DB handle. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] del 0 to empty the DB, 1 to delete it from the + * environment and close the DB handle. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); + + /** @brief Set a custom key comparison function for a database. 
+ * + * The comparison function is called whenever it is necessary to compare a + * key specified by the application with a key currently stored in the database. + * If no comparison function is specified, and no special key flags were specified + * with #mdb_dbi_open(), the keys are compared lexically, with shorter keys collating + * before longer keys. + * @warning This function must be called before any data access functions are used, + * otherwise data corruption may occur. The same comparison function must be used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + + /** @brief Set a custom data comparison function for a #MDB_DUPSORT database. + * + * This comparison function is called whenever it is necessary to compare a data + * item specified by the application with a data item currently stored in the database. + * This function only takes effect if the database was opened with the #MDB_DUPSORT + * flag. + * If no comparison function is specified, and no special key flags were specified + * with #mdb_dbi_open(), the data items are compared lexically, with shorter items collating + * before longer items. + * @warning This function must be called before any data access functions are used, + * otherwise data corruption may occur. The same comparison function must be used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + + /** @brief Set a relocation function for a #MDB_FIXEDMAP database. + * + * @todo The relocation function is called whenever it is necessary to move the data + * of an item to a different position in the database (e.g. through tree + * balancing operations, shifts as a result of adds or deletes, etc.). It is + * intended to allow address/position-dependent data items to be stored in + * a database in an environment opened with the #MDB_FIXEDMAP option. + * Currently the relocation feature is unimplemented and setting + * this function has no effect. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] rel A #MDB_rel_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); + + /** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation function. + * + * See #mdb_set_relfunc and #MDB_rel_func for more details. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * It will be passed to the callback function set by #mdb_set_relfunc + * as its \b relctx parameter whenever the callback is invoked. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); + + /** @brief Get items from a database. + * + * This function retrieves key/data pairs from the database. The address + * and length of the data associated with the specified \b key are returned + * in the structure to which \b data refers. + * If the database supports duplicate keys (#MDB_DUPSORT) then the + * first data item for the key will be returned. Retrieval of other + * items requires the use of #mdb_cursor_get(). + * + * @note The memory pointed to by the returned values is owned by the + * database. The caller need not dispose of the memory, and may not + * modify it in any way. For values returned in a read-only transaction + * any modification attempts will cause a SIGSEGV. + * @note Values returned from the database are valid only until a + * subsequent update operation, or the end of the transaction. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to search for in the database + * @param[out] data The data corresponding to the key + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the key was not in the database. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + + /** @brief Store items into a database. + * + * This function stores key/data pairs in the database. The default behavior + * is to enter the new key/data pair, replacing any previously existing key + * if duplicates are disallowed, or adding a duplicate data item if + * duplicates are allowed (#MDB_DUPSORT). + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to store in the database + * @param[in,out] data The data to store + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with #MDB_DUPSORT. The function will + * return #MDB_KEYEXIST if the key/data pair already appears in the + * database. + *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will return + * #MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (#MDB_DUPSORT). The \b data + * parameter will be set to point to the existing item. + *
  • #MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. + * LMDB does nothing else with this memory; the caller is expected + * to modify all of the space requested. This flag must not be + * specified if the database was opened with #MDB_DUPSORT. + *
  • #MDB_APPEND - append the given key/data pair to the end of the + * database. This option allows fast bulk loading when keys are + * already known to be in the correct order. Loading unsorted keys + * with this flag will cause a #MDB_KEYEXIST error. + *
  • #MDB_APPENDDUP - as above, but for sorted dup data. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned int flags); + + /** @brief Delete items from a database. + * + * This function removes key/data pairs from the database. + * If the database does not support sorted duplicate data items + * (#MDB_DUPSORT) the data parameter is ignored. + * If the database supports sorted duplicates and the data parameter + * is NULL, all of the duplicate data items for the key will be + * deleted. Otherwise, if the data parameter is non-NULL + * only the matching data item will be deleted. + * This function will return #MDB_NOTFOUND if the specified key/data + * pair is not in the database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to delete from the database + * @param[in] data The data to delete + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + + /** @brief Create a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * A cursor cannot be used when its database handle is closed. Nor + * when its transaction has ended, except with #mdb_cursor_renew(). + * It can be discarded with #mdb_cursor_close(). + * A cursor in a write-transaction can be closed before its transaction + * ends, and will otherwise be closed when its transaction ends. + * A cursor in a read-only transaction must be closed explicitly, before + * or after its transaction ends. It can be reused with + * #mdb_cursor_renew() before finally closing it. + * @note Earlier documentation said that cursors in every transaction + * were closed when the transaction committed or aborted. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] cursor Address where the new #MDB_cursor handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); + + /** @brief Close a cursor handle. + * + * The cursor handle will be freed and must not be used again after this call. + * Its transaction must still be live if it is a write-transaction. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +void mdb_cursor_close(MDB_cursor *cursor); + + /** @brief Renew a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * Cursors that are only used in read-only + * transactions may be re-used, to avoid unnecessary malloc/free overhead. + * The cursor may be associated with a new read-only transaction, and + * referencing the same database handle as it was created with. + * This may be done whether the previous transaction is live or dead. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); + + /** @brief Return the cursor's transaction handle. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +MDB_txn *mdb_cursor_txn(MDB_cursor *cursor); + + /** @brief Return the cursor's database handle. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +MDB_dbi mdb_cursor_dbi(MDB_cursor *cursor); + + /** @brief Retrieve by cursor. + * + * This function retrieves key/data pairs from the database. The address and length + * of the key are returned in the object to which \b key refers (except for the + * case of the #MDB_SET option, in which the \b key object is unchanged), and + * the address and length of the data are returned in the object to which \b data + * refers. + * See #mdb_get() for restrictions on using the output values. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in,out] key The key for a retrieved item + * @param[in,out] data The data of a retrieved item + * @param[in] op A cursor operation #MDB_cursor_op + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - no matching key found. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op); + + /** @brief Store by cursor. + * + * This function stores key/data pairs into the database. + * The cursor is positioned at the new item, or on failure usually near it. + * @note Earlier documentation incorrectly said errors would leave the + * state of the cursor unchanged. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in] key The key operated on. + * @param[in] data The data operated on. + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
    + *
  • #MDB_CURRENT - replace the item at the current cursor position. + * The \b key parameter must still be provided, and must match the key of that item. + * If using sorted duplicates (#MDB_DUPSORT) the data item must still + * sort into the same place. This is intended to be used when the + * new data is the same size as the old. Otherwise it will simply + * perform a delete of the old record followed by an insert. + *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with #MDB_DUPSORT. The function will + * return #MDB_KEYEXIST if the key/data pair already appears in the + * database. + *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will return + * #MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (#MDB_DUPSORT). + *
  • #MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. This flag + * must not be specified if the database was opened with #MDB_DUPSORT. + *
  • #MDB_APPEND - append the given key/data pair to the end of the + * database. No key comparisons are performed. This option allows + * fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * a #MDB_KEYEXIST error. + *
  • #MDB_APPENDDUP - as above, but for sorted dup data. + *
  • #MDB_MULTIPLE - store multiple contiguous data elements in a + * single request. This flag may only be specified if the database + * was opened with #MDB_DUPFIXED. The \b data argument must be an + * array of two MDB_vals. The mv_size of the first MDB_val must be + * the size of a single data element. The mv_data of the first MDB_val + * must point to the beginning of the array of contiguous data elements. + * The mv_size of the second MDB_val must be the count of the number + * of data elements to store. On return this field will be set to + * the count of the number of elements actually written. The mv_data + * of the second MDB_val is unused. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + unsigned int flags); + + /** @brief Delete current key/data pair + * + * This function deletes the key/data pair to which the cursor refers. + * This does not invalidate the cursor, so operations such as MDB_NEXT + * can still be used on it. + * Both MDB_NEXT and MDB_GET_CURRENT will return the same record after + * this operation. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
    + *
  • #MDB_NODUPDATA - delete all of the data items for the current key. + * This flag may only be specified if the database was opened with #MDB_DUPSORT. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags); + + /** @brief Return count of duplicates for current key. + * + * This call is only valid on databases that support sorted duplicate + * data items #MDB_DUPSORT. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[out] countp Address where the count will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - cursor is not initialized, or an invalid parameter was specified. + *
+ */ +int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); + + /** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two data items were keys in the + * specified database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + + /** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two items were data items of + * the specified database. The database must have the #MDB_DUPSORT flag. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + + /** @brief A callback function used to print a message from the library. + * + * @param[in] msg The string to be printed. + * @param[in] ctx An arbitrary context pointer for the callback. + * @return < 0 on failure, >= 0 on success. + */ +typedef int (MDB_msg_func)(const char *msg, void *ctx); + + /** @brief Dump the entries in the reader lock table. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] func A #MDB_msg_func function + * @param[in] ctx Anything the message function needs + * @return < 0 on failure, >= 0 on success. + */ +int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); + + /** @brief Check for stale entries in the reader lock table. 
+ * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] dead Number of stale slots that were cleared + * @return 0 on success, non-zero on failure. + */ +int mdb_reader_check(MDB_env *env, int *dead); +/** @} */ + +#ifdef __cplusplus +} +#endif +/** @page tools LMDB Command Line Tools + The following describes the command line tools that are available for LMDB. + \li \ref mdb_copy_1 + \li \ref mdb_dump_1 + \li \ref mdb_load_1 + \li \ref mdb_stat_1 +*/ + +#endif /* _LMDB_H_ */ diff --git a/c/third_party/lmdb/libraries/liblmdb/mdb.c b/c/third_party/lmdb/libraries/liblmdb/mdb.c new file mode 100644 index 0000000..deb6779 --- /dev/null +++ b/c/third_party/lmdb/libraries/liblmdb/mdb.c @@ -0,0 +1,10354 @@ +/** @file mdb.c + * @brief Lightning memory-mapped database library + * + * A Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. + */ +/* + * Copyright 2011-2021 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + * + * This code is derived from btree.c written by Martin Hedenfalk. + * + * Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#if defined(__WIN64__) +#define _FILE_OFFSET_BITS 64 +#endif +#ifdef _WIN32 +#include +#include +#include /* get wcscpy() */ + +/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it + * as int64 which is wrong. MSVC doesn't define it at all, so just + * don't use it. + */ +#define MDB_PID_T int +#define MDB_THR_T DWORD +#include +#include +#ifdef __GNUC__ +# include +#else +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# define BYTE_ORDER LITTLE_ENDIAN +# ifndef SSIZE_MAX +# define SSIZE_MAX INT_MAX +# endif +#endif +#else +#include +#include +#define MDB_PID_T pid_t +#define MDB_THR_T pthread_t +#include +#include +#include +#ifdef HAVE_SYS_FILE_H +#include +#endif +#include +#endif + +#if defined(__mips) && defined(__linux) +/* MIPS has cache coherency issues, requires explicit cache control */ +#include +#define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) +#else +#define CACHEFLUSH(addr, bytes, cache) +#endif + +#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) +/** fdatasync is broken on ext3/ext4fs on older kernels, see + * description in #mdb_env_open2 comments. You can safely + * define MDB_FDATASYNC_WORKS if this code will only be run + * on kernels 3.6 and newer. 
+ */ +#define BROKEN_FDATASYNC +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#include +typedef SSIZE_T ssize_t; +#else +#include +#endif + +#if defined(__sun) || defined(ANDROID) +/* Most platforms have posix_memalign, older may only have memalign */ +#define HAVE_MEMALIGN 1 +#include +/* On Solaris, we need the POSIX sigwait function */ +#if defined (__sun) +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif +#endif + +#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) +#include +#include /* defines BYTE_ORDER on HPUX and Solaris */ +#endif + +#if defined(__FreeBSD__) && defined(__FreeBSD_version) && __FreeBSD_version >= 1100110 +# define MDB_USE_POSIX_MUTEX 1 +# define MDB_USE_ROBUST 1 +#elif defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) +# define MDB_USE_POSIX_SEM 1 +# define MDB_FDATASYNC fsync +#elif defined(ANDROID) +# define MDB_FDATASYNC fsync +#endif + +#ifndef _WIN32 +#include +#include +#ifdef MDB_USE_POSIX_SEM +# define MDB_USE_HASH 1 +#include +#else +#define MDB_USE_POSIX_MUTEX 1 +#endif +#endif + +#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ + + defined(MDB_USE_POSIX_MUTEX) != 1 +# error "Ambiguous shared-lock implementation" +#endif + +#ifdef USE_VALGRIND +#include +#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) +#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) +#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) +#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) +#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) +#else +#define VGMEMP_CREATE(h,r,z) +#define VGMEMP_ALLOC(h,a,s) +#define VGMEMP_FREE(h,a) +#define VGMEMP_DESTROY(h) +#define VGMEMP_DEFINED(a,s) +#endif + +#ifndef BYTE_ORDER +# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) +/* Solaris just defines one or the other */ +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# ifdef _LITTLE_ENDIAN +# define 
BYTE_ORDER LITTLE_ENDIAN +# else +# define BYTE_ORDER BIG_ENDIAN +# endif +# else +# define BYTE_ORDER __BYTE_ORDER +# endif +#endif + +#ifndef LITTLE_ENDIAN +#define LITTLE_ENDIAN __LITTLE_ENDIAN +#endif +#ifndef BIG_ENDIAN +#define BIG_ENDIAN __BIG_ENDIAN +#endif + +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +#define MISALIGNED_OK 1 +#endif + +#include "lmdb.h" +#include "midl.h" + +#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) +# error "Unknown or unsupported endianness (BYTE_ORDER)" +#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +# error "Two's complement, reasonably sized integer types, please" +#endif + +#if (((__clang_major__ << 8) | __clang_minor__) >= 0x0302) || (((__GNUC__ << 8) | __GNUC_MINOR__) >= 0x0403) +/** Mark infrequently used env functions as cold. This puts them in a separate + * section, and optimizes them for size */ +#define ESECT __attribute__ ((cold)) +#else +/* On older compilers, use a separate section */ +# ifdef __GNUC__ +# ifdef __APPLE__ +# define ESECT __attribute__ ((section("__TEXT,text_env"))) +# else +# define ESECT __attribute__ ((section("text_env"))) +# endif +# else +# define ESECT +# endif +#endif + +#ifdef _WIN32 +#define CALL_CONV WINAPI +#else +#define CALL_CONV +#endif + +/** @defgroup internal LMDB Internals + * @{ + */ +/** @defgroup compat Compatibility Macros + * A bunch of macros to minimize the amount of platform-specific ifdefs + * needed throughout the rest of the code. When the features this library + * needs are similar enough to POSIX to be hidden in a one-or-two line + * replacement, this macro approach is used. 
+ * @{
+ */
+
+	/** Features under development */
+#ifndef MDB_DEVEL
+#define MDB_DEVEL 0
+#endif
+
+	/** Wrapper around __func__, which is a C99 feature */
+#if __STDC_VERSION__ >= 199901L
+# define mdb_func_ __func__
+#elif __GNUC__ >= 2 || _MSC_VER >= 1300
+# define mdb_func_ __FUNCTION__
+#else
+/* If a debug message says (), update the #if statements above */
+# define mdb_func_ ""
+#endif
+
+/* Internal error codes, not exposed outside liblmdb */
+#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10)
+#ifdef _WIN32
+#define MDB_OWNERDEAD ((int) WAIT_ABANDONED)
+#elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD)
+#define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */
+#endif
+
+#ifdef __GLIBC__
+#define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) /**< glibc version packed as (major << 16) | minor, e.g. 2.4 is 0x020004 */
+#endif
+/** Some platforms define the EOWNERDEAD error code
+ * even though they don't support Robust Mutexes.
+ * Compile with -DMDB_USE_ROBUST=0, or use some other
+ * mechanism like -DMDB_USE_POSIX_SEM instead of
+ * -DMDB_USE_POSIX_MUTEX.
+ * (Posix semaphores are not robust.)
+ *
+ * When the builder does not set MDB_USE_ROBUST, it defaults to 1
+ * except for POSIX-mutex builds on Android or on glibc older than 2.4.
+ */
+#ifndef MDB_USE_ROBUST
+/* Android currently lacks Robust Mutex support. So does glibc < 2.4. */
+# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \
+	(defined(__GLIBC__) && GLIBC_VER < 0x020004))
+# define MDB_USE_ROBUST 0
+# else
+# define MDB_USE_ROBUST 1
+# endif
+#endif /* !MDB_USE_ROBUST */
+
+#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST)
+/* glibc < 2.12 only provided _np API, so map the standard names onto the
+ * _np spellings there (and wherever only PTHREAD_MUTEX_ROBUST_NP exists). */
+# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \
+	(defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST))
+# define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP
+# define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag)
+# define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex)
+# endif
+#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */
+
+#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST)
+#define MDB_ROBUST_SUPPORTED 1 /**< defined only when a dead-owner code exists and robust locking was not disabled */
+#endif
+
+#ifdef _WIN32 /* Win32: express the POSIX names used by the rest of the file via Win32 calls */
+#define MDB_USE_HASH 1
+#define MDB_PIDLOCK 0
+#define THREAD_RET DWORD
+#define pthread_t HANDLE
+#define pthread_mutex_t HANDLE
+#define pthread_cond_t HANDLE
+typedef HANDLE mdb_mutex_t, mdb_mutexref_t;
+#define pthread_key_t DWORD
+#define pthread_self() GetCurrentThreadId()
+#define pthread_key_create(x,y) \
+	((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) /* note: the destructor argument y is not used here */
+#define pthread_key_delete(x) TlsFree(x)
+#define pthread_getspecific(x) TlsGetValue(x)
+#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
+#define pthread_mutex_unlock(x) ReleaseMutex(*x)
+#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
+#define pthread_cond_signal(x) SetEvent(*x)
+#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
+#define THREAD_CREATE(thr,start,arg) \
+	(((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode())
+#define THREAD_FINISH(thr) \
+	(WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) /* any nonzero wait result is reported via ErrCode() */
+#define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE)
+#define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex)
+#define mdb_mutex_consistent(mutex) 0 /**< no repair step in this branch; always evaluates to 0 */
+#define getpid() GetCurrentProcessId()
+#define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) /* negated so that 0 means success, like the POSIX call */
+#define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) /* flags is ignored in this branch */
+#define ErrCode() GetLastError()
+#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} /* a brace block, not an expression: use only as a full statement */
+#define close(fd) (CloseHandle(fd) ? 0 : -1)
+#define munmap(ptr,len) UnmapViewOfFile(ptr)
+#ifdef PROCESS_QUERY_LIMITED_INFORMATION
+#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
+#else
+#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 /* fallback when the SDK headers do not define the constant */
+#endif
+#define Z "I" /**< printf format modifier for size_t in this (Win32) branch */
+#else
+#define THREAD_RET void *
+#define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg)
+#define THREAD_FINISH(thr) pthread_join(thr,NULL)
+#define Z "z" /**< printf format modifier for size_t */
+
+	/** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
+#define MDB_PIDLOCK 1
+
+#ifdef MDB_USE_POSIX_SEM
+
+typedef sem_t *mdb_mutex_t, *mdb_mutexref_t;
+#define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex)
+#define UNLOCK_MUTEX(mutex) sem_post(mutex)
+/** Wait on \b sem, retrying when interrupted. Returns 0 on success, else the errno of the failed sem_wait(). */
+static int
+mdb_sem_wait(sem_t *sem)
+{
+	int rc;
+	while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; /* loop only while the failure is EINTR; rc ends as 0 or errno */
+	return rc;
+}
+
+#else /* MDB_USE_POSIX_MUTEX: */
+	/** Shared mutex/semaphore as the original is stored.
+	 *
+	 * Not for copies. Instead it can be assigned to an #mdb_mutexref_t.
+	 * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it
+	 * is array[size 1] so it can be assigned to the pointer.
+	 */
+typedef pthread_mutex_t mdb_mutex_t[1];
+	/** Reference to an #mdb_mutex_t */
+typedef pthread_mutex_t *mdb_mutexref_t;
+	/** Lock the reader or writer mutex.
+	 * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX().
+	 */
+#define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex)
+	/** Unlock the reader or writer mutex.
+	 */
+#define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex)
+	/** Mark mutex-protected data as repaired, after death of previous owner.
+	 */
+#define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex)
+#endif /* MDB_USE_POSIX_SEM */
+
+	/** Get the error code for the last failed system function.
+	 */
+#define ErrCode() errno
+
+	/** An abstraction for a file handle.
+	 * On POSIX systems file handles are small integers. On Windows
+	 * they're opaque pointers.
+	 */
+#define HANDLE int
+
+	/** A value for an invalid file handle.
+	 * Mainly used to initialize file variables and signify that they are
+	 * unused.
+	 */
+#define INVALID_HANDLE_VALUE (-1)
+
+	/** Get the size of a memory page for the system.
+	 * This is the basic size that the platform's memory manager uses, and is
+	 * fundamental to the use of memory-mapped files.
+	 */
+#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
+#endif
+/** Bytes reserved per lock in the lock file: a 32-byte name buffer on Win32/POSIX-semaphore builds, otherwise the mutex object itself. */
+#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
+#define MNAME_LEN 32
+#else
+#define MNAME_LEN (sizeof(pthread_mutex_t))
+#endif
+
+/** @} */
+
+#ifdef MDB_ROBUST_SUPPORTED
+	/** Lock mutex, handle any error, set rc = result.
+	 * Return 0 on success, nonzero (not rc) on error.
+	 * #mdb_mutex_failed() is consulted only when #LOCK_MUTEX0() returns
+	 * nonzero, and its result then replaces rc.
+	 */
+#define LOCK_MUTEX(rc, env, mutex) \
+	(((rc) = LOCK_MUTEX0(mutex)) && \
+	((rc) = mdb_mutex_failed(env, mutex, rc)))
+static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc);
+#else
+#define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) /* no recovery step: rc is the raw LOCK_MUTEX0() result */
+#define mdb_mutex_failed(env, mutex, rc) (rc) /* pass-through when robust locking is unavailable */
+#endif
+
+#ifndef _WIN32
+/** A flag for opening a file and requesting synchronous data writes.
+ * This is only used when writing a meta page. It's not strictly needed;
+ * we could just do a normal write and then immediately perform a flush.
+ * But if this flag is available it saves us an extra system call.
+ *
+ * @note If O_DSYNC is undefined but exists in /usr/include,
+ * preferably set some compiler flag to get the definition.
+ * Otherwise this falls back to O_SYNC.
+ */
+#ifndef MDB_DSYNC
+# ifdef O_DSYNC
+# define MDB_DSYNC O_DSYNC
+# else
+# define MDB_DSYNC O_SYNC
+# endif
+#endif
+#endif
+
+/** Function for flushing the data of a file. Define this to fsync
+ * if fdatasync() is not supported.
+ */
+#ifndef MDB_FDATASYNC
+# define MDB_FDATASYNC fdatasync
+#endif
+
+#ifndef MDB_MSYNC
+# define MDB_MSYNC(addr,len,flags) msync(addr,len,flags)
+#endif
+/* Fallback values for builds whose headers do not define MS_SYNC / MS_ASYNC. */
+#ifndef MS_SYNC
+#define MS_SYNC 1
+#endif
+
+#ifndef MS_ASYNC
+#define MS_ASYNC 0
+#endif
+
+	/** A page number in the database.
+	 * Note that 64 bit page numbers are overkill, since pages themselves
+	 * already represent 12-13 bits of addressable memory, and the OS will
+	 * always limit applications to a maximum of 63 bits of address space.
+	 *
+	 * @note In the #MDB_node structure, we only store 48 bits of this value,
+	 * which thus limits us to only 60 bits of addressable data.
+	 */
+typedef MDB_ID pgno_t;
+
+	/** A transaction ID.
+	 * See struct MDB_txn.mt_txnid for details.
+	 */
+typedef MDB_ID txnid_t;
+
+/** @defgroup debug Debug Macros
+ * @{
+ */
+#ifndef MDB_DEBUG
+	/** Enable debug output. Needs variable argument macros (a C99 feature).
+	 * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
+	 * read from and written to the database (used for free space management).
+	 */
+#define MDB_DEBUG 0
+#endif
+
+#if MDB_DEBUG
+static int mdb_debug; /**< runtime switch: #DPRINTF() prints only while this is nonzero */
+static txnid_t mdb_debug_start;
+
+	/** Print a debug message with printf formatting.
+	 * Requires double parenthesis around 2 or more args.
+	 * Output goes to stderr, prefixed with the function name and line.
+	 */
+# define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args))
+# define DPRINTF0(fmt, ...) \
+	fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__)
+#else
+# define DPRINTF(args) ((void) 0)
+#endif
+	/** Print a debug string.
+	 * The string is printed literally, with no format processing.
+	 */
+#define DPUTS(arg) DPRINTF(("%s", arg))
+	/** Debugging output value of a cursor DBI: Negative in a sub-cursor.
*/
+#define DDBI(mc) \
+	(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
+/** @} */
+
+	/** @brief The maximum size of a database page.
+	 *
+	 * It is 32k or 64k, since value-PAGEBASE must fit in
+	 * #MDB_page.%mp_upper.
+	 * (64k when #PAGEBASE is nonzero, 32k otherwise.)
+	 *
+	 * LMDB will use database pages < OS pages if needed.
+	 * That causes more I/O in write transactions: The OS must
+	 * know (read) the whole page before writing a partial page.
+	 *
+	 * Note that we don't currently support Huge pages. On Linux,
+	 * regular data files cannot use Huge pages, and in general
+	 * Huge pages aren't actually pageable. We rely on the OS
+	 * demand-pager to read our data and page it out when memory
+	 * pressure from other processes is high. So until OSs have
+	 * actual paging support for Huge pages, they're not viable.
+	 */
+#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
+
+	/** The minimum number of keys required in a database page.
+	 * Setting this to a larger value will place a smaller bound on the
+	 * maximum size of a data item. Data items larger than this size will
+	 * be pushed into overflow pages instead of being stored directly in
+	 * the B-tree node. This value used to default to 4. With a page size
+	 * of 4096 bytes that meant that any item larger than 1024 bytes would
+	 * go into an overflow page. That also meant that on average 2-3KB of
+	 * each overflow page was wasted space. The value cannot be lower than
+	 * 2 because then there would no longer be a tree structure. With this
+	 * value, items larger than 2KB will go into overflow pages, and on
+	 * average only 1KB will be wasted.
+	 */
+#define MDB_MINKEYS 2
+
+	/** A stamp that identifies a file as an LMDB file.
+	 * There's nothing special about this value other than that it is easily
+	 * recognizable, and it will reflect any byte order mismatches.
+	 */
+#define MDB_MAGIC 0xBEEFC0DE
+
+	/** The version number for a database's datafile format: 999 when #MDB_DEVEL is set, otherwise 1. */
+#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1)
+	/** The version number for a database's lockfile format. */
+#define MDB_LOCK_VERSION 1
+
+	/** @brief The max size of a key we can write, or 0 for computed max.
+	 *
+	 * This macro should normally be left alone or set to 0.
+	 * Note that a database with big keys or dupsort data cannot be
+	 * reliably modified by a liblmdb which uses a smaller max.
+	 * The default is 511 for backwards compat, or 0 when #MDB_DEVEL.
+	 *
+	 * Other values are allowed, for backwards compat. However:
+	 * A value bigger than the computed max can break if you do not
+	 * know what you are doing, and liblmdb <= 0.9.10 can break when
+	 * modifying a DB with keys/dupsort data bigger than its max.
+	 *
+	 * Data items in an #MDB_DUPSORT database are also limited to
+	 * this size, since they're actually keys of a sub-DB. Keys and
+	 * #MDB_DUPSORT data items must fit on a node in a regular page.
+	 */
+#ifndef MDB_MAXKEYSIZE
+#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511)
+#endif
+
+	/** The maximum size of a key we can write to the environment.
+	 * A nonzero #MDB_MAXKEYSIZE is used as a compile-time constant;
+	 * when it is 0 the limit is read from the environment's me_maxkey.
+	 */
+#if MDB_MAXKEYSIZE
+#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE)
+#else
+#define ENV_MAXKEY(env) ((env)->me_maxkey)
+#endif
+
+	/** @brief The maximum size of a data item.
+	 *
+	 * We only store a 32 bit value for node sizes.
+	 */
+#define MAXDATASIZE 0xffffffffUL
+
+#if MDB_DEBUG
+	/** Key size which fits in a #DKBUF.
+	 * @ingroup debug
+	 * Falls back to 511 when #MDB_MAXKEYSIZE is 0 (computed max).
+	 */
+#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511)
+	/** A key buffer.
+	 * @ingroup debug
+	 * This is used for printing a hex dump of a key's contents.
+	 * Declares a local array named kbuf, which #DKEY() refers to by name.
+	 * NOTE(review): the *2+1 sizing presumably allows two hex digits per
+	 * key byte plus a terminator - confirm against mdb_dkey().
+	 */
+#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1]
+	/** Display a key in hex.
+	 * @ingroup debug
+	 * Invoke a function to display a key in hex.
+	 */
+#define DKEY(x) mdb_dkey(x, kbuf)
+#else
+#define DKBUF /* expands to nothing when debugging is off */
+#define DKEY(x) 0
+#endif
+
+	/** An invalid page number.
+	 * Mainly used to denote an empty tree.
+	 * The value is all bits set in a #pgno_t.
+	 */
+#define P_INVALID (~(pgno_t)0)
+
+	/** Test if the flags \b f are set in a flag word \b w.
*/
+#define F_ISSET(w, f) (((w) & (f)) == (f)) /* true only when every bit of f is set in w */
+
+	/** Round \b n up to an even number. */
+#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
+
+	/** Used for offsets within a single page.
+	 * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
+	 * this is plenty.
+	 */
+typedef uint16_t indx_t;
+
+	/** Default size of memory map.
+	 * This is certainly too small for any actual applications. Apps should always set
+	 * the size explicitly using #mdb_env_set_mapsize().
+	 * The value is 1 MiB (2^20 bytes).
+	 */
+#define DEFAULT_MAPSIZE 1048576
+
+/** @defgroup readers Reader Lock Table
+ * Readers don't acquire any locks for their data access. Instead, they
+ * simply record their transaction ID in the reader table. The reader
+ * mutex is needed just to find an empty slot in the reader table. The
+ * slot's address is saved in thread-specific data so that subsequent read
+ * transactions started by the same thread need no further locking to proceed.
+ *
+ * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
+ *
+ * No reader table is used if the database is on a read-only filesystem, or
+ * if #MDB_NOLOCK is set.
+ *
+ * Since the database uses multi-version concurrency control, readers don't
+ * actually need any locking. This table is used to keep track of which
+ * readers are using data from which old transactions, so that we'll know
+ * when a particular old transaction is no longer in use. Old transactions
+ * that have discarded any data pages can then have those pages reclaimed
+ * for use by a later write transaction.
+ *
+ * The lock table is constructed such that reader slots are aligned with the
+ * processor's cache line size. Any slot is only ever used by one thread.
+ * This alignment guarantees that there will be no contention or cache
+ * thrashing as threads update their own slot info, and also eliminates
+ * any need for locking when accessing a slot.
+ *
+ * A writer thread will scan every slot in the table to determine the oldest
+ * outstanding reader transaction. Any freed pages older than this will be
+ * reclaimed by the writer. The writer doesn't use any locks when scanning
+ * this table. This means that there's no guarantee that the writer will
+ * see the most up-to-date reader info, but that's not required for correct
+ * operation - all we need is to know the upper bound on the oldest reader,
+ * we don't care at all about the newest reader. So the only consequence of
+ * reading stale information here is that old pages might hang around a
+ * while longer before being reclaimed. That's actually good anyway, because
+ * the longer we delay reclaiming old pages, the more likely it is that a
+ * string of contiguous pages can be found after coalescing old pages from
+ * many old transactions together.
+ * @{
+ */
+	/** Number of slots in the reader table.
+	 * This value was chosen somewhat arbitrarily. 126 readers plus a
+	 * couple mutexes fit exactly into 8KB on my development machine.
+	 * Applications should set the table size using #mdb_env_set_maxreaders().
+	 */
+#define DEFAULT_READERS 126
+
+	/** The size of a CPU cache line in bytes. We want our lock structures
+	 * aligned to this size to avoid false cache line sharing in the
+	 * lock table.
+	 * This value works for most CPUs. For Itanium this should be 128.
+	 * The padding expressions below mask with ~(CACHELINE-1), so an
+	 * override must be a power of two.
+	 */
+#ifndef CACHELINE
+#define CACHELINE 64
+#endif
+
+	/** The information we store in a single slot of the reader table.
+	 * In addition to a transaction ID, we also record the process and
+	 * thread ID that owns a slot, so that we can detect stale information,
+	 * e.g. threads or processes that went away without cleaning up.
+	 * @note We currently don't check for stale records. We simply re-init
+	 * the table when we know that we're the only process opening the
+	 * lock file.
+	 */
+typedef struct MDB_rxbody {
+	/** Current Transaction ID when this transaction began, or (txnid_t)-1.
+	 * Multiple readers that start at the same time will probably have the
+	 * same ID here. Again, it's not important to exclude them from
+	 * anything; all we need to know is which version of the DB they
+	 * started from so we can avoid overwriting any data used in that
+	 * particular version.
+	 */
+	volatile txnid_t mrb_txnid;
+	/** The process ID of the process owning this reader txn. */
+	volatile MDB_PID_T mrb_pid;
+	/** The thread ID of the thread owning this txn. */
+	volatile MDB_THR_T mrb_tid;
+} MDB_rxbody;
+
+	/** The actual reader record, with cacheline padding. */
+typedef struct MDB_reader {
+	union {
+		MDB_rxbody mrx;
+		/** shorthand for mrb_txnid */
+#define mr_txnid mru.mrx.mrb_txnid
+#define mr_pid mru.mrx.mrb_pid /**< shorthand for mrb_pid */
+#define mr_tid mru.mrx.mrb_tid /**< shorthand for mrb_tid */
+		/** cache line alignment: sizeof(MDB_rxbody) rounded up to a multiple of CACHELINE */
+		char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
+	} mru;
+} MDB_reader;
+
+	/** The header for the reader table.
+	 * The table resides in a memory-mapped file. (This is a different file
+	 * than is used for the main database.)
+	 *
+	 * For POSIX the actual mutexes reside in the shared memory of this
+	 * mapped file. On Windows, mutexes are named objects allocated by the
+	 * kernel; we store the mutex names in this mapped file so that other
+	 * processes can grab them. This same approach is also used on
+	 * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
+	 * process-shared POSIX mutexes. For these cases where a named object
+	 * is used, the object name is derived from a 64 bit FNV hash of the
+	 * environment pathname. As such, naming collisions are extremely
+	 * unlikely. If a collision occurs, the results are unpredictable.
+	 */
+typedef struct MDB_txbody {
+	/** Stamp identifying this as an LMDB file. It must be set
+	 * to #MDB_MAGIC. */
+	uint32_t mtb_magic;
+	/** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
+	uint32_t mtb_format;
+#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
+	char mtb_rmname[MNAME_LEN]; /**< name of the named lock object that takes the place of #mtb_rmutex in these builds */
+#else
+	/** Mutex protecting access to this table.
+	 * This is the reader table lock used with LOCK_MUTEX().
+	 */
+	mdb_mutex_t mtb_rmutex;
+#endif
+	/** The ID of the last transaction committed to the database.
+	 * This is recorded here only for convenience; the value can always
+	 * be determined by reading the main database meta pages.
+	 */
+	volatile txnid_t mtb_txnid;
+	/** The number of slots that have been used in the reader table.
+	 * This always records the maximum count, it is not decremented
+	 * when readers release their slots.
+	 */
+	volatile unsigned mtb_numreaders;
+} MDB_txbody;
+
+	/** The actual reader table definition.
+	 * Two cacheline-padded unions (header, then a second lock) are
+	 * followed by the reader slots. The mti_* macros are shorthands
+	 * for the members nested inside those unions.
+	 */
+typedef struct MDB_txninfo {
+	union {
+		MDB_txbody mtb;
+#define mti_magic mt1.mtb.mtb_magic
+#define mti_format mt1.mtb.mtb_format
+#define mti_rmutex mt1.mtb.mtb_rmutex
+#define mti_rmname mt1.mtb.mtb_rmname
+#define mti_txnid mt1.mtb.mtb_txnid
+#define mti_numreaders mt1.mtb.mtb_numreaders
+		char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
+	} mt1;
+	union {
+#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
+		char mt2_wmname[MNAME_LEN]; /* NOTE(review): presumably the writer lock's object name - confirm */
+#define mti_wmname mt2.mt2_wmname
+#else
+		mdb_mutex_t mt2_wmutex; /* NOTE(review): presumably the writer mutex - confirm */
+#define mti_wmutex mt2.mt2_wmutex
+#endif
+		char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
+	} mt2;
+	MDB_reader mti_readers[1]; /* declared with one element; NOTE(review): presumably indexed past it into the mapped file - confirm */
+} MDB_txninfo;
+
+	/** Lockfile format signature: version, features and field layout.
+	 * The value is #MDB_LOCK_VERSION, plus bit 16 when #MDB_PIDLOCK is nonzero.
+	 */
+#define MDB_LOCK_FORMAT \
+	((uint32_t) \
+	((MDB_LOCK_VERSION) \
+	/* Flags which describe functionality */ \
+	+ (((MDB_PIDLOCK) != 0) << 16)))
+/** @} */
+
+/** Common header for all page types. The page type depends on #mp_flags.
+ *
+ * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with
+ * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages
+ * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header.
+ *
+ * #P_OVERFLOW records occupy one or more contiguous pages where only the
+ * first has a page header. They hold the real data of #F_BIGDATA nodes.
+ *
+ * #P_SUBP sub-pages are small leaf "pages" with duplicate data.
+ * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page.
+ * (Duplicate data can also go in sub-databases, which use normal pages.)
+ *
+ * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot.
+ *
+ * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once
+ * in the snapshot: Either used by a database or listed in a freeDB record.
+ *
+ * The mp_* macros defined inside the struct are shorthands for members
+ * of its two unions.
+ */
+typedef struct MDB_page {
+#define mp_pgno mp_p.p_pgno
+#define mp_next mp_p.p_next
+	union {
+		pgno_t p_pgno; /**< page number */
+		struct MDB_page *p_next; /**< for in-memory list of freed pages */
+	} mp_p;
+	uint16_t mp_pad; /**< key size if this is a LEAF2 page */
+/** @defgroup mdb_page Page Flags
+ * @ingroup internal
+ * Flags for the page headers.
+ * @{
+ */
+#define P_BRANCH 0x01 /**< branch page */
+#define P_LEAF 0x02 /**< leaf page */
+#define P_OVERFLOW 0x04 /**< overflow page */
+#define P_META 0x08 /**< meta page */
+#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
+#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
+#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
+#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
+#define P_KEEP 0x8000 /**< leave this page alone during spill */
+/** @} */
+	uint16_t mp_flags; /**< @ref mdb_page */
+#define mp_lower mp_pb.pb.pb_lower
+#define mp_upper mp_pb.pb.pb_upper
+#define mp_pages mp_pb.pb_pages
+	union {
+		struct {
+			indx_t pb_lower; /**< lower bound of free space */
+			indx_t pb_upper; /**< upper bound of free space */
+		} pb;
+		uint32_t pb_pages; /**< number of overflow pages */
+	} mp_pb;
+	indx_t mp_ptrs[0]; /**< dynamic size */
+} MDB_page;
+
+/** Alternate page header, for 2-byte aligned access.
+ * Every member is 16 bits wide or an array of 16-bit words; the MP_*()
+ * macros below read header fields through this view. */
+typedef struct MDB_page2 {
+	uint16_t mp2_p[sizeof(pgno_t)/2]; /* same byte count as a pgno_t, split into 16-bit words */
+	uint16_t mp2_pad;
+	uint16_t mp2_flags;
+	indx_t mp2_lower;
+	indx_t mp2_upper;
+	indx_t mp2_ptrs[0];
+} MDB_page2;
+/* Header accessors taking any page pointer \b p; the cast goes through void * to reach the MDB_page2 view. */
+#define MP_PGNO(p) (((MDB_page2 *)(void *)(p))->mp2_p)
+#define MP_PAD(p) (((MDB_page2 *)(void *)(p))->mp2_pad)
+#define MP_FLAGS(p) (((MDB_page2 *)(void *)(p))->mp2_flags)
+#define MP_LOWER(p) (((MDB_page2 *)(void *)(p))->mp2_lower)
+#define MP_UPPER(p) (((MDB_page2 *)(void *)(p))->mp2_upper)
+#define MP_PTRS(p) (((MDB_page2 *)(void *)(p))->mp2_ptrs)
+
+	/** Size of the page header, excluding dynamic data at the end */
+#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs))
+
+	/** Address of first usable data byte in a page, after the header */
+#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
+
+	/** ITS#7713, change PAGEBASE to handle 65536 byte pages.
+	 * PAGEBASE is #PAGEHDRSZ in #MDB_DEVEL builds and 0 otherwise. */
+#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0)
+
+	/** Number of nodes on a page: the bytes between the header and mp_lower, divided by 2 */
+#define NUMKEYS(p) ((MP_LOWER(p) - (PAGEHDRSZ-PAGEBASE)) >> 1)
+
+	/** The amount of space remaining in the page */
+#define SIZELEFT(p) (indx_t)(MP_UPPER(p) - MP_LOWER(p))
+
+	/** The percentage of space used in the page, in tenths of a percent. */
+#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
+	((env)->me_psize - PAGEHDRSZ))
+	/** The minimum page fill factor, in tenths of a percent.
+	 * Pages emptier than this are candidates for merging.
+	 * 250 therefore means 25%.
+	 */
+#define FILL_THRESHOLD 250
+
+	/** Test if a page is a leaf page */
+#define IS_LEAF(p) F_ISSET(MP_FLAGS(p), P_LEAF)
+	/** Test if a page is a LEAF2 page */
+#define IS_LEAF2(p) F_ISSET(MP_FLAGS(p), P_LEAF2)
+	/** Test if a page is a branch page */
+#define IS_BRANCH(p) F_ISSET(MP_FLAGS(p), P_BRANCH)
+	/** Test if a page is an overflow page */
+#define IS_OVERFLOW(p) F_ISSET(MP_FLAGS(p), P_OVERFLOW)
+	/** Test if a page is a sub page */
+#define IS_SUBP(p) F_ISSET(MP_FLAGS(p), P_SUBP)
+
+	/** The number of overflow pages needed to store the given size.
*/
+#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) /* i.e. ceil((PAGEHDRSZ + size) / psize) for positive psize */
+
+	/** Link in #MDB_txn.%mt_loose_pgs list.
+	 * Kept outside the page header, which is needed when reusing the page.
+	 * NOTE(review): (p) + 2 is pointer arithmetic in units of *p, so the
+	 * link's location depends on p's type - presumably MDB_page *; confirm.
+	 */
+#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
+
+	/** Header for a single key/data pair within a page.
+	 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
+	 * We guarantee 2-byte alignment for 'MDB_node's.
+	 *
+	 * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child
+	 * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used
+	 * for pgno. (Branch nodes have no flags). Lo and hi are in host byte
+	 * order in case some accesses can be optimized to 32-bit word access.
+	 *
+	 * Leaf node flags describe node contents. #F_BIGDATA says the node's
+	 * data part is the page number of an overflow page with actual data.
+	 * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in
+	 * a sub-page/sub-database, and named databases (just #F_SUBDATA).
+	 */
+typedef struct MDB_node {
+	/** part of data size or pgno
+	 * @{ */
+#if BYTE_ORDER == LITTLE_ENDIAN
+	unsigned short mn_lo, mn_hi;
+#else
+	unsigned short mn_hi, mn_lo;
+#endif
+	/** @} */
+/** @defgroup mdb_node Node Flags
+ * @ingroup internal
+ * Flags for node headers.
+ * @{
+ */
+#define F_BIGDATA 0x01 /**< data put on overflow page */
+#define F_SUBDATA 0x02 /**< data is a sub-database */
+#define F_DUPDATA 0x04 /**< data has duplicates */
+
+/** valid flags for #mdb_node_add() */
+#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)
+
+/** @} */
+	unsigned short mn_flags; /**< @ref mdb_node */
+	unsigned short mn_ksize; /**< key size */
+	char mn_data[1]; /**< key and data are appended here */
+} MDB_node;
+
+	/** Size of the node header, excluding dynamic data at the end */
+#define NODESIZE offsetof(MDB_node, mn_data)
+
+	/** Bit position of top word in page number, for shifting mn_flags.
+	 * 32 when #pgno_t is wider than 32 bits, otherwise 0 (mn_flags unused for pgno). */
+#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
+
+	/** Size of a node in a branch page with a given key.
+	 * This is just the node header plus the key, there is no data.
+	 * A NULL key counts as zero key bytes.
+	 */
+#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
+
+	/** Size of a node in a leaf page with a given key and data.
+	 * This is node header plus key plus data size.
+	 * Unlike #INDXSIZE(), neither argument may be NULL.
+	 */
+#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
+
+	/** Address of node \b i in page \b p */
+#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + MP_PTRS(p)[i] + PAGEBASE))
+
+	/** Address of the key for the node */
+#define NODEKEY(node) (void *)((node)->mn_data)
+
+	/** Address of the data for a node: it starts mn_ksize bytes into mn_data, right after the key */
+#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)
+
+	/** Get the page number pointed to by a branch node */
+#define NODEPGNO(node) \
+	((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
+	(PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
+	/** Set the page number in a branch node */
+#define SETPGNO(node,pgno) do { \
+	(node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
+	if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
+
+	/** Get the size of the data in a leaf node */
+#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
+	/** Set the size of the data for a leaf node */
+#define SETDSZ(node,size) do { \
+	(node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
+	/** The size of a key in a node */
+#define NODEKSZ(node) ((node)->mn_ksize)
+
+	/** Copy a page number from src to dst.
+	 * With MISALIGNED_OK this is a plain assignment and #MP_PGNO() is
+	 * redefined to read mp_pgno directly. Otherwise the value is copied
+	 * as 16-bit words: four when SIZE_MAX exceeds 32 bits, else two.
+	 * Both arguments must be lvalues (their addresses are taken). */
+#ifdef MISALIGNED_OK
+#define COPY_PGNO(dst,src) dst = src
+#undef MP_PGNO
+#define MP_PGNO(p) ((p)->mp_pgno)
+#else
+#if SIZE_MAX > 4294967295UL
+#define COPY_PGNO(dst,src) do { \
+	unsigned short *s, *d; \
+	s = (unsigned short *)&(src); \
+	d = (unsigned short *)&(dst); \
+	*d++ = *s++; \
+	*d++ = *s++; \
+	*d++ = *s++; \
+	*d = *s; \
+} while (0)
+#else
+#define COPY_PGNO(dst,src) do { \
+	unsigned short *s, *d; \
+	s = (unsigned short *)&(src); \
+	d = (unsigned short *)&(dst); \
+	*d++ = *s++; \
+	*d = *s; \
+} while (0)
+#endif
+#endif
+	/** The address of a key in a LEAF2 page.
+	 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
+	 * There are no node headers, keys are stored contiguously.
+	 * Key \b i of size \b ks starts at byte offset PAGEHDRSZ + i*ks.
+	 */
+#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
+
+	/** Set the \b node's key into \b keyptr, if requested (a NULL \b keyptr is skipped). */
+#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \
+	(keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }
+
+	/** Set the \b node's key into \b key. Here \b key is a struct lvalue, not a pointer. */
+#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }
+
+	/** Information about a single database in the environment. */
+typedef struct MDB_db {
+	uint32_t md_pad; /**< also ksize for LEAF2 pages */
+	uint16_t md_flags; /**< @ref mdb_dbi_open */
+	uint16_t md_depth; /**< depth of this tree */
+	pgno_t md_branch_pages; /**< number of internal pages */
+	pgno_t md_leaf_pages; /**< number of leaf pages */
+	pgno_t md_overflow_pages; /**< number of overflow pages */
+	size_t md_entries; /**< number of data items */
+	pgno_t md_root; /**< the root page of this tree */
+} MDB_db;
+
+#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
+#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) /**< every 16-bit flag except #MDB_VALID */
+	/** #mdb_dbi_open() flags */
+#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
+	MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
+
+	/** Handle for the DB used to track free pages. */
+#define FREE_DBI 0
+	/** Handle for the default DB. */
+#define MAIN_DBI 1
+	/** Number of DBs in metapage (free and main) - also hardcoded elsewhere */
+#define CORE_DBS 2
+
+	/** Number of meta pages - also hardcoded elsewhere */
+#define NUM_METAS 2
+
+	/** Meta page content.
+	 * A meta page is the start point for accessing a database snapshot.
+	 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
+	 */
+typedef struct MDB_meta {
+	/** Stamp identifying this as an LMDB file. It must be set
+	 * to #MDB_MAGIC. */
+	uint32_t mm_magic;
+	/** Version number of this file. Must be set to #MDB_DATA_VERSION. */
+	uint32_t mm_version;
+	void *mm_address; /**< address for fixed mapping */
+	size_t mm_mapsize; /**< size of mmap region */
+	MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */
+	/** The size of pages used in this DB (kept in the free DB record's md_pad field) */
+#define mm_psize mm_dbs[FREE_DBI].md_pad
+	/** Any persistent environment flags (kept in the free DB record's md_flags field). @ref mdb_env */
+#define mm_flags mm_dbs[FREE_DBI].md_flags
+	/** Last used page in the datafile.
+	 * Actually the file may be shorter if the freeDB lists the final pages.
+	 */
+	pgno_t mm_last_pg;
+	volatile txnid_t mm_txnid; /**< txnid that committed this page */
+} MDB_meta;
+
+	/** Buffer for a stack-allocated meta page.
+	 * The members define size and alignment, and silence type
+	 * aliasing warnings. They are not used directly; that could
+	 * mean incorrectly using several union members in parallel.
+	 */
+typedef union MDB_metabuf {
+	MDB_page mb_page;
+	struct {
+		char mm_pad[PAGEHDRSZ]; /* reserves #PAGEHDRSZ bytes ahead of mm_meta */
+		MDB_meta mm_meta;
+	} mb_metabuf;
+} MDB_metabuf;
+
+	/** Auxiliary DB info.
+	 * The information here is mostly static/read-only. There is
+	 * only a single copy of this record in the environment.
+	 */
+typedef struct MDB_dbx {
+	MDB_val md_name; /**< name of the database */
+	MDB_cmp_func *md_cmp; /**< function for comparing keys */
+	MDB_cmp_func *md_dcmp; /**< function for comparing data items */
+	MDB_rel_func *md_rel; /**< user relocate function */
+	void *md_relctx; /**< user-provided context for md_rel */
+} MDB_dbx;
+
+	/** A database transaction.
+	 * Every operation requires a transaction handle.
+	 */
+struct MDB_txn {
+	MDB_txn *mt_parent; /**< parent of a nested txn */
+	/** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */
+	MDB_txn *mt_child;
+	pgno_t mt_next_pgno; /**< next unallocated page */
+	/** The ID of this transaction. IDs are integers incrementing from 1.
+	 * Only committed write transactions increment the ID. If a transaction
+	 * aborts, the ID may be re-used by the next writer.
+	 */
+	txnid_t mt_txnid;
+	MDB_env *mt_env; /**< the DB environment */
+	/** The list of pages that became unused during this transaction.
+	 */
+	MDB_IDL mt_free_pgs;
+	/** The list of loose pages that became unused and may be reused
+	 * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
+	 */
+	MDB_page *mt_loose_pgs;
+	/** Number of loose pages (#mt_loose_pgs) */
+	int mt_loose_count;
+	/** The sorted list of dirty pages we temporarily wrote to disk
+	 * because the dirty list was full. Page numbers in here are
+	 * shifted left by 1; deleted slots have the LSB set.
+	 */
+	MDB_IDL mt_spill_pgs;
+	union {
+		/** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
+		MDB_ID2L dirty_list;
+		/** For read txns: This thread/txn's reader table slot, or NULL. */
+		MDB_reader *reader;
+	} mt_u; /* which member applies depends on whether this is a write or a read txn */
+	/** Array of records for each DB known in the environment. */
+	MDB_dbx *mt_dbxs;
+	/** Array of MDB_db records for each known DB */
+	MDB_db *mt_dbs;
+	/** Array of sequence numbers for each DB handle */
+	unsigned int *mt_dbiseqs;
+/** @defgroup mt_dbflag Transaction DB Flags
+ * @ingroup internal
+ * @{
+ */
+#define DB_DIRTY 0x01 /**< DB was written in this txn */
+#define DB_STALE 0x02 /**< Named-DB record is older than txnID */
+#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
+#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
+#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */
+#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */
+/** @} */
+	/** In write txns, array of cursors for each DB */
+	MDB_cursor **mt_cursors;
+	/** Array of flags for each DB (values from @ref mt_dbflag) */
+	unsigned char *mt_dbflags;
+	/** Number of DB records in use, or 0 when the txn is finished.
+	 * This number only ever increments until the txn finishes; we
+	 * don't decrement it when individual DB handles are closed.
+	 */
+	MDB_dbi mt_numdbs;
+
+/** @defgroup mdb_txn Transaction Flags
+ * @ingroup internal
+ * @{
+ */
+	/** #mdb_txn_begin() flags */
+#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY
+#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */
+	/* internal txn flags */
+#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */
+#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */
+#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */
+#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */
+#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
+#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */
+	/** most operations on the txn are currently illegal */
+#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD)
+/** @} */
+	unsigned int mt_flags; /**< @ref mdb_txn */
+	/** #dirty_list room: Array size - \#dirty pages visible to this txn.
+	 * Includes ancestor txns' dirty pages not hidden by other txns'
+	 * dirty/spilled pages. Thus commit(nested txn) has room to merge
+	 * dirty_list into mt_parent after freeing hidden mt_parent pages.
+	 */
+	unsigned int mt_dirty_room;
+};
+
+/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
+ * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
+ * raise this on a 64 bit machine.
+ * This is the length of the per-cursor #MDB_cursor.%mc_pg and
+ * #MDB_cursor.%mc_ki arrays.
+ */
+#define CURSOR_STACK 32
+
+struct MDB_xcursor;
+
+	/** Cursors are used for all DB operations.
+	 * A cursor holds a path of (page pointer, key index) from the DB
+	 * root to a position in the DB, plus other state. #MDB_DUPSORT
+	 * cursors include an xcursor to the current data item. Write txns
+	 * track their cursors and keep them up to date when data moves.
+	 * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
+	 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
+	 */
+struct MDB_cursor {
+	/** Next cursor on this DB in this txn */
+	MDB_cursor *mc_next;
+	/** Backup of the original cursor if this cursor is a shadow */
+	MDB_cursor *mc_backup;
+	/** Context used for databases with #MDB_DUPSORT, otherwise NULL */
+	struct MDB_xcursor *mc_xcursor;
+	/** The transaction that owns this cursor */
+	MDB_txn *mc_txn;
+	/** The database handle this cursor operates on */
+	MDB_dbi mc_dbi;
+	/** The database record for this cursor */
+	MDB_db *mc_db;
+	/** The database auxiliary record for this cursor */
+	MDB_dbx *mc_dbx;
+	/** The @ref mt_dbflag for this database */
+	unsigned char *mc_dbflag;
+	unsigned short mc_snum; /**< number of pushed pages */
+	unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
+/** @defgroup mdb_cursor Cursor Flags
+ * @ingroup internal
+ * Cursor state flags.
+ * @{
+ */
+#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
+#define C_EOF 0x02 /**< No more data */
+#define C_SUB 0x04 /**< Cursor is a sub-cursor */
+#define C_DEL 0x08 /**< last op was a cursor_del */
+#define C_UNTRACK 0x40 /**< Un-track cursor when closing */
+/** @} */
+	unsigned int mc_flags; /**< @ref mdb_cursor */
+	MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
+	indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
+};
+
+	/** Context for sorted-dup records.
+	 * We could have gone to a fully recursive design, with arbitrarily
+	 * deep nesting of sub-databases. But for now we only handle these
+	 * levels - main DB, optional sub-DB, sorted-duplicate DB.
+	 */
+typedef struct MDB_xcursor {
+	/** A sub-cursor for traversing the Dup DB */
+	MDB_cursor mx_cursor;
+	/** The database record for this Dup DB */
+	MDB_db mx_db;
+	/** The auxiliary DB record for this Dup DB */
+	MDB_dbx mx_dbx;
+	/** The @ref mt_dbflag for this Dup DB */
+	unsigned char mx_dbflag;
+} MDB_xcursor;
+
+	/** Check if there is an inited xcursor: mc has one and its sub-cursor has #C_INITIALIZED set */
+#define XCURSOR_INITED(mc) \
+	((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
+
+	/** Update the xcursor's sub-page pointer, if any, in \b mc. Needed
+	 * when the node which contains the sub-page may have moved. Called
+	 * with leaf page \b mp = mc->mc_pg[\b top].
+ */ +#define XCURSOR_REFRESH(mc, top, mp) do { \ + MDB_page *xr_pg = (mp); \ + MDB_node *xr_node; \ + if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \ + xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \ + if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ +} while (0) + + /** State of FreeDB old pages, stored in the MDB_env */ +typedef struct MDB_pgstate { + pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ +} MDB_pgstate; + + /** The database environment. */ +struct MDB_env { + HANDLE me_fd; /**< The main data file */ + HANDLE me_lfd; /**< The lock file */ + HANDLE me_mfd; /**< For writing and syncing the meta pages */ + /** Failed to update the meta page. Probably an I/O error. */ +#define MDB_FATAL_ERROR 0x80000000U + /** Some fields are initialized. */ +#define MDB_ENV_ACTIVE 0x20000000U + /** me_txkey is set */ +#define MDB_ENV_TXKEY 0x10000000U + /** fdatasync is unreliable */ +#define MDB_FSYNCONLY 0x08000000U + uint32_t me_flags; /**< @ref mdb_env */ + unsigned int me_psize; /**< DB page size, inited from me_os_psize */ + unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ + unsigned int me_maxreaders; /**< size of the reader table */ + /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ + volatile int me_close_readers; + MDB_dbi me_numdbs; /**< number of DBs opened */ + MDB_dbi me_maxdbs; /**< size of the DB table */ + MDB_PID_T me_pid; /**< process ID of this env */ + char *me_path; /**< path to the DB files */ + char *me_map; /**< the memory map of the data file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ + MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ + void *me_pbuf; /**< scratch area for DUPSORT put() */ + MDB_txn *me_txn; /**< current write transaction */ + MDB_txn *me_txn0; 
/**< prealloc'd write transaction */ + size_t me_mapsize; /**< size of the data memory map */ + off_t me_size; /**< current file size */ + pgno_t me_maxpg; /**< me_mapsize / me_psize */ + MDB_dbx *me_dbxs; /**< array of static DB info */ + uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ + unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */ + pthread_key_t me_txkey; /**< thread-key for readers */ + txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ + MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ +# define me_pglast me_pgstate.mf_pglast +# define me_pghead me_pgstate.mf_pghead + MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ + /** IDL of pages that became unused in a write txn */ + MDB_IDL me_free_pgs; + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + MDB_ID2L me_dirty_list; + /** Max number of freelist items that can fit in a single overflow page */ + int me_maxfree_1pg; + /** Max size of a node on a page */ + unsigned int me_nodemax; +#if !(MDB_MAXKEYSIZE) + unsigned int me_maxkey; /**< max size of a key */ +#endif + int me_live_reader; /**< have liveness lock in reader table */ +#ifdef _WIN32 + int me_pidquery; /**< Used in OpenProcess */ +#endif +#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ +# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ +# define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */ +#else + mdb_mutex_t me_rmutex; + mdb_mutex_t me_wmutex; +#endif + void *me_userctx; /**< User-settable context */ + MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ +}; + + /** Nested transaction */ +typedef struct MDB_ntxn { + MDB_txn mnt_txn; /**< the transaction */ + MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ +} MDB_ntxn; + + /** max number of pages to commit in one writev() call */ +#define MDB_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES 
+#undef MDB_COMMIT_PAGES +#define MDB_COMMIT_PAGES IOV_MAX +#endif + + /** max bytes to write in one call */ +#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) + + /** Check \b txn and \b dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) + + /** Check for misused \b dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); +static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); +static int mdb_page_touch(MDB_cursor *mc); + +#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ + "reset-tmp", "fail-begin", "fail-beginchild"} +enum { + /* mdb_txn_end operation number, for logging */ + MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, + MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD +}; +#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ +#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ +#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ +#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ +static void mdb_txn_end(MDB_txn *txn, unsigned mode); + +static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdb_page_search_root(MDB_cursor *mc, + MDB_val *key, int modify); +#define MDB_PS_MODIFY 1 +#define MDB_PS_ROOTONLY 2 +#define MDB_PS_FIRST 4 +#define MDB_PS_LAST 8 +static int mdb_page_search(MDB_cursor *mc, + MDB_val *key, int flags); +static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); + +#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ +static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, + pgno_t newpgno, unsigned int nflags); + +static int mdb_env_read_header(MDB_env *env, MDB_meta 
*meta); +static MDB_meta *mdb_env_pick_meta(const MDB_env *env); +static int mdb_env_write_meta(MDB_txn *txn); +#if defined(MDB_USE_POSIX_MUTEX) && !defined(MDB_ROBUST_SUPPORTED) /* Drop unused excl arg */ +# define mdb_env_close0(env, excl) mdb_env_close1(env) +#endif +static void mdb_env_close0(MDB_env *env, int excl); + +static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static int mdb_node_add(MDB_cursor *mc, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); +static void mdb_node_del(MDB_cursor *mc, int ksize); +static void mdb_node_shrink(MDB_page *mp, indx_t indx); +static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); +static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); +static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); +static size_t mdb_branch_size(MDB_env *env, MDB_val *key); + +static int mdb_rebalance(MDB_cursor *mc); +static int mdb_update_key(MDB_cursor *mc, MDB_val *key); + +static void mdb_cursor_pop(MDB_cursor *mc); +static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); + +static int mdb_cursor_del0(MDB_cursor *mc); +static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); +static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); +static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); +static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); +static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, + int *exactp); +static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); +static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); + +static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); +static void mdb_xcursor_init0(MDB_cursor *mc); +static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); +static void 
mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); + +static int mdb_drop0(MDB_cursor *mc, int subs); +static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); +static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); + +/** @cond */ +static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; +/** @endcond */ + +/** Compare two items pointing at size_t's of unknown alignment. */ +#ifdef MISALIGNED_OK +# define mdb_cmp_clong mdb_cmp_long +#else +# define mdb_cmp_clong mdb_cmp_cint +#endif + +#ifdef _WIN32 +static SECURITY_DESCRIPTOR mdb_null_sd; +static SECURITY_ATTRIBUTES mdb_all_sa; +static int mdb_sec_inited; + +struct MDB_name; +static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); +#endif + +/** Return the library version info. */ +char * ESECT +mdb_version(int *major, int *minor, int *patch) +{ + if (major) *major = MDB_VERSION_MAJOR; + if (minor) *minor = MDB_VERSION_MINOR; + if (patch) *patch = MDB_VERSION_PATCH; + return MDB_VERSION_STRING; +} + +/** Table of descriptions for LMDB @ref errors */ +static char *const mdb_errstr[] = { + "MDB_KEYEXIST: Key/data pair already exists", + "MDB_NOTFOUND: No matching key/data pair found", + "MDB_PAGE_NOTFOUND: Requested page not found", + "MDB_CORRUPTED: Located page was wrong type", + "MDB_PANIC: Update of meta page failed or environment had fatal error", + "MDB_VERSION_MISMATCH: Database environment version mismatch", + "MDB_INVALID: File is not an LMDB file", + "MDB_MAP_FULL: Environment mapsize limit reached", + "MDB_DBS_FULL: Environment maxdbs limit reached", + "MDB_READERS_FULL: Environment maxreaders limit reached", + "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", + "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", + "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents 
grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", + "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", + "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", +}; + +char * +mdb_strerror(int err) +{ +#ifdef _WIN32 + /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. + * This works as long as no function between the call to mdb_strerror + * and the actual use of the message uses more than 4K of stack. + */ +#define MSGSIZE 1024 +#define PADSIZE 4096 + char buf[MSGSIZE+PADSIZE], *ptr = buf; +#endif + int i; + if (!err) + return ("Successful return: 0"); + + if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { + i = err - MDB_KEYEXIST; + return mdb_errstr[i]; + } + +#ifdef _WIN32 + /* These are the C-runtime error codes we use. The comment indicates + * their numeric value, and the Win32 error they would correspond to + * if the error actually came from a Win32 API. A major mess, we should + * have used LMDB-specific error codes for everything. 
+ */ + switch(err) { + case ENOENT: /* 2, FILE_NOT_FOUND */ + case EIO: /* 5, ACCESS_DENIED */ + case ENOMEM: /* 12, INVALID_ACCESS */ + case EACCES: /* 13, INVALID_DATA */ + case EBUSY: /* 16, CURRENT_DIRECTORY */ + case EINVAL: /* 22, BAD_COMMAND */ + case ENOSPC: /* 28, OUT_OF_PAPER */ + return strerror(err); + default: + ; + } + buf[0] = 0; + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE); + return ptr; +#else + if (err < 0) + return "Invalid error code"; + return strerror(err); +#endif +} + +/** assert(3) variant in cursor context */ +#define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) +/** assert(3) variant in transaction context */ +#define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) +/** assert(3) variant in environment context */ +#define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) + +#ifndef NDEBUG +# define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ + mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) + +static void ESECT +mdb_assert_fail(MDB_env *env, const char *expr_txt, + const char *func, const char *file, int line) +{ + char buf[400]; + sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()", + file, line, expr_txt, func); + if (env->me_assert_func) + env->me_assert_func(env, buf); + fprintf(stderr, "%s\n", buf); + abort(); +} +#else +# define mdb_assert0(env, expr, expr_txt) ((void) 0) +#endif /* NDEBUG */ + +#if MDB_DEBUG +/** Return the page number of \b mp which may be sub-page, for debug output */ +static pgno_t +mdb_dbg_pgno(MDB_page *mp) +{ + pgno_t ret; + COPY_PGNO(ret, MP_PGNO(mp)); + return ret; +} + +/** Display a key in hexadecimal and return the address of the result. + * @param[in] key the key to display + * @param[in] buf the buffer to write into. Should always be #DKBUF. + * @return The key in hexadecimal form. 
+ */
+char *
+mdb_dkey(MDB_val *key, char *buf)
+{
+	char *ptr = buf;
+	unsigned char *c = key ? key->mv_data : NULL;	/* key is only NULL-checked below; do not dereference it here */
+	unsigned int i;
+
+	if (!key)
+		return "";
+
+	if (key->mv_size > DKBUF_MAXKEYSIZE)
+		return "MDB_MAXKEYSIZE";
+	/* may want to make this a dynamic check: if the key is mostly
+	 * printable characters, print it as-is instead of converting to hex.
+	 */
+#if 1
+	buf[0] = '\0';
+	for (i=0; i<key->mv_size; i++)	/* two hex digits per key byte */
+		ptr += sprintf(ptr, "%02x", *c++);
+#else
+	sprintf(buf, "%.*s", key->mv_size, key->mv_data);
+#endif
+	return buf;
+}
+
+static const char *
+mdb_leafnode_type(MDB_node *n)	/* debug-output suffix chosen from the node's F_BIGDATA/F_DUPDATA/F_SUBDATA flags */
+{
+	static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
+	return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
+		tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
+}
+
+/** Display all the keys in the page. */
+void
+mdb_page_list(MDB_page *mp)
+{
+	pgno_t pgno = mdb_dbg_pgno(mp);
+	const char *type, *state = (MP_FLAGS(mp) & P_DIRTY) ? ", dirty" : "";
+	MDB_node *node;
+	unsigned int i, nkeys, nsize, total = 0;
+	MDB_val key;
+	DKBUF;
+
+	switch (MP_FLAGS(mp) & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) {
+	case P_BRANCH:              type = "Branch page";		break;
+	case P_LEAF:                type = "Leaf page";			break;
+	case P_LEAF|P_SUBP:         type = "Sub-page";			break;
+	case P_LEAF|P_LEAF2:        type = "LEAF2 page";		break;
+	case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page";	break;
+	case P_OVERFLOW:
+		fprintf(stderr, "Overflow page %"Z"u pages %u%s\n",
+			pgno, mp->mp_pages, state);
+		return;
+	case P_META:
+		fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n",
+			pgno, ((MDB_meta *)METADATA(mp))->mm_txnid);
+		return;
+	default:
+		fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, MP_FLAGS(mp));
+		return;
+	}
+
+	nkeys = NUMKEYS(mp);
+	fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state);
+
+	for (i=0; i<nkeys; i++) {
+		if (IS_LEAF2(mp)) {	/* LEAF2 pages have no mp_ptrs[] or node headers */
+			key.mv_size = nsize = mp->mp_pad;
+			key.mv_data = LEAF2KEY(mp, i, nsize);
+			total += nsize;
+			fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
+			continue;
+		}
+
node = NODEPTR(mp, i); + key.mv_size = node->mn_ksize; + key.mv_data = node->mn_data; + nsize = NODESIZE + key.mv_size; + if (IS_BRANCH(mp)) { + fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), + DKEY(&key)); + total += nsize; + } else { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += NODEDSZ(node); + total += nsize; + nsize += sizeof(indx_t); + fprintf(stderr, "key %d: nsize %d, %s%s\n", + i, nsize, DKEY(&key), mdb_leafnode_type(node)); + } + total = EVEN(total); + } + fprintf(stderr, "Total: header %d + contents %d + unused %d\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + MP_LOWER(mp), total, SIZELEFT(mp)); +} + +void +mdb_cursor_chk(MDB_cursor *mc) +{ + unsigned int i; + MDB_node *node; + MDB_page *mp; + + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; + for (i=0; imc_top; i++) { + mp = mc->mc_pg[i]; + node = NODEPTR(mp, mc->mc_ki[i]); + if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) + printf("oops!\n"); + } + if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) + printf("ack!\n"); + if (XCURSOR_INITED(mc)) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && + mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { + printf("blah!\n"); + } + } +} +#endif + +#if (MDB_DEBUG) > 2 +/** Count all the pages in each DB and in the freelist + * and make sure it matches the actual number of pages + * being used. + * All named DBs must be open for a correct count. 
+ */ +static void mdb_audit(MDB_txn *txn) +{ + MDB_cursor mc; + MDB_val key, data; + MDB_ID freecount, count; + MDB_dbi i; + int rc; + + freecount = 0; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + mdb_tassert(txn, rc == MDB_NOTFOUND); + + count = 0; + for (i = 0; imt_numdbs; i++) { + MDB_xcursor mx; + if (!(txn->mt_dbflags[i] & DB_VALID)) + continue; + mdb_cursor_init(&mc, txn, i, &mx); + if (txn->mt_dbs[i].md_root == P_INVALID) + continue; + count += txn->mt_dbs[i].md_branch_pages + + txn->mt_dbs[i].md_leaf_pages + + txn->mt_dbs[i].md_overflow_pages; + if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); + for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { + unsigned j; + MDB_page *mp; + mp = mc.mc_pg[mc.mc_top]; + for (j=0; jmn_flags & F_SUBDATA) { + MDB_db db; + memcpy(&db, NODEDATA(leaf), sizeof(db)); + count += db.md_branch_pages + db.md_leaf_pages + + db.md_overflow_pages; + } + } + } + mdb_tassert(txn, rc == MDB_NOTFOUND); + } + } + if (freecount + count + NUM_METAS != txn->mt_next_pgno) { + fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", + txn->mt_txnid, freecount, count+NUM_METAS, + freecount+count+NUM_METAS, txn->mt_next_pgno); + } +} +#endif + +int +mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) +{ + return txn->mt_dbxs[dbi].md_cmp(a, b); +} + +int +mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) +{ + MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + return dcmp(a, b); +} + +/** Allocate memory for a page. + * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. 
+ */ +static MDB_page * +mdb_page_malloc(MDB_txn *txn, unsigned num) +{ + MDB_env *env = txn->mt_env; + MDB_page *ret = env->me_dpages; + size_t psize = env->me_psize, sz = psize, off; + /* For ! #MDB_NOMEMINIT, psize counts how much to init. + * For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. + */ + if (num == 1) { + if (ret) { + VGMEMP_ALLOC(env, ret, sz); + VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); + env->me_dpages = ret->mp_next; + return ret; + } + psize -= off = PAGEHDRSZ; + } else { + sz *= num; + off = sz - psize; + } + if ((ret = malloc(sz)) != NULL) { + VGMEMP_ALLOC(env, ret, sz); + if (!(env->me_flags & MDB_NOMEMINIT)) { + memset((char *)ret + off, 0, psize); + ret->mp_pad = 0; + } + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + return ret; +} +/** Free a single page. + * Saves single pages to a list, for future reuse. + * (This is not used for multi-page overflow pages.) + */ +static void +mdb_page_free(MDB_env *env, MDB_page *mp) +{ + mp->mp_next = env->me_dpages; + VGMEMP_FREE(env, mp); + env->me_dpages = mp; +} + +/** Free a dirty page */ +static void +mdb_dpage_free(MDB_env *env, MDB_page *dp) +{ + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdb_page_free(env, dp); + } else { + /* large pages just get freed directly */ + VGMEMP_FREE(env, dp); + free(dp); + } +} + +/** Return all dirty pages to dpage list */ +static void +mdb_dlist_free(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + mdb_dpage_free(env, dl[i].mptr); + } + dl[0].mid = 0; +} + +/** Loosen or free a single page. + * Saves single pages to a list for future reuse + * in this same txn. It has been pulled from the freeDB + * and already resides on the dirty list, but has been + * deleted. 
Use these pages first before pulling again + * from the freeDB. + * + * If the page wasn't dirtied in this txn, just add it + * to this txn's free list. + */ +static int +mdb_page_loose(MDB_cursor *mc, MDB_page *mp) +{ + int loose = 0; + pgno_t pgno = mp->mp_pgno; + MDB_txn *txn = mc->mc_txn; + + if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { + if (txn->mt_parent) { + MDB_ID2 *dl = txn->mt_u.dirty_list; + /* If txn has a parent, make sure the page is in our + * dirty list. + */ + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { /* bad cursor? */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + /* ok, it's ours */ + loose = 1; + } + } + } else { + /* no parent txn, so it's just ours */ + loose = 1; + } + } + if (loose) { + DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), + mp->mp_pgno)); + NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; + txn->mt_loose_pgs = mp; + txn->mt_loose_count++; + mp->mp_flags |= P_LOOSE; + } else { + int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); + if (rc) + return rc; + } + + return MDB_SUCCESS; +} + +/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. + * @param[in] mc A cursor handle for the current operation. + * @param[in] pflags Flags of the pages to update: + * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) +{ + enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m3, *m0 = mc; + MDB_xcursor *mx; + MDB_page *dp, *mp; + MDB_node *leaf; + unsigned i, j; + int rc = MDB_SUCCESS, level; + + /* Mark pages seen by cursors */ + if (mc->mc_flags & C_UNTRACK) + mc = NULL; /* will find mc in mt_cursors */ + for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { + for (; mc; mc=mc->mc_next) { + if (!(mc->mc_flags & C_INITIALIZED)) + continue; + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; + for (j=0; jmc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + } + mx = m3->mc_xcursor; + /* Proceed to mx if it is at a sub-database */ + if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (! (mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j-1]); + if (!(leaf->mn_flags & F_SUBDATA)) + break; + } + } + if (i == 0) + break; + } + + if (all) { + /* Mark dirty root pages */ + for (i=0; imt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS) + break; + if ((dp->mp_flags & Mask) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } + } + } + + return rc; +} + +static int mdb_page_flush(MDB_txn *txn, int keep); + +/** Spill pages from the dirty list back to disk. + * This is intended to prevent running into #MDB_TXN_FULL situations, + * but note that they may still occur in a few cases: + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of #MDB_MULTIPLE items. + * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. 
TODO: we probably should do + * a preemptive spill during #mdb_txn_begin() of a child txn, if + * the parent's dirty_room is below a given threshold. + * + * Otherwise, if not using nested txns, it is expected that apps will + * not run into #MDB_TXN_FULL any more. The pages are flushed to disk + * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. + * If the txn never references them again, they can be left alone. + * If the txn only reads them, they can be used without any fuss. + * If the txn writes them again, they can be dirtied immediately without + * going thru all of the work of #mdb_page_touch(). Such references are + * handled by #mdb_page_unspill(). + * + * Also note, we never spill DB root pages, nor pages of active cursors, + * because we'll need these back again soon anyway. And in nested txns, + * we can't spill a page in a child txn if it was already spilled in a + * parent txn. That would alter the parent txns' data even though + * the child hasn't committed yet, and we'd have no way to undo it if + * the child aborted. + * + * @param[in] m0 cursor A cursor handle identifying the transaction and + * database for which we are checking space. + * @param[in] key For a put operation, the key being stored. + * @param[in] data For a put operation, the data being stored. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) +{ + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned int i, j, need; + int rc; + + if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; + + /* Estimate how much space this op will take */ + i = m0->mc_db->md_depth; + /* Named DBs also dirty the main DB */ + if (m0->mc_dbi >= CORE_DBS) + i += txn->mt_dbs[MAIN_DBI].md_depth; + /* For puts, roughly factor in the key+data size */ + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; /* double it for good measure */ + need = i; + + if (txn->mt_dirty_room > i) + return MDB_SUCCESS; + + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); + if (!txn->mt_spill_pgs) + return ENOMEM; + } else { + /* purge deleted slots */ + MDB_IDL sl = txn->mt_spill_pgs; + unsigned int num = sl[0]; + j=0; + for (i=1; i<=num; i++) { + if (!(sl[i] & 1)) + sl[++j] = sl[i]; + } + sl[0] = j; + } + + /* Preserve pages which may soon be dirtied again */ + if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) + goto done; + + /* Less aggressive spill - we originally spilled the entire dirty list, + * with a few exceptions for cursor pages and DB root pages. But this + * turns out to be a lot of wasted effort because in a large txn many + * of those pages will need to be used again. So now we spill only 1/8th + * of the dirty pages. Testing revealed this to be a good tradeoff, + * better than 1/2, 1/4, or 1/10. + */ + if (need < MDB_IDL_UM_MAX / 8) + need = MDB_IDL_UM_MAX / 8; + + /* Save the page IDs of all the pages we're flushing */ + /* flush from the tail forward, this saves a lot of shifting later on. */ + for (i=dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; + dp = dl[i].mptr; + if (dp->mp_flags & (P_LOOSE|P_KEEP)) + continue; + /* Can't spill twice, make sure it's not already in a parent's + * spill list. 
+ */ + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + j = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + if (tx2) + continue; + } + if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) + goto done; + need--; + } + mdb_midl_sort(txn->mt_spill_pgs); + + /* Flush the spilled part of dirty list */ + if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) + goto done; + + /* Reset any dirty pages we kept that page_flush didn't see */ + rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); + +done: + txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + return rc; +} + +/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ +static txnid_t +mdb_find_oldest(MDB_txn *txn) +{ + int i; + txnid_t mr, oldest = txn->mt_txnid - 1; + if (txn->mt_env->me_txns) { + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } + } + } + return oldest; +} + +/** Add a page to the txn's dirty list */ +static void +mdb_page_dirty(MDB_txn *txn, MDB_page *mp) +{ + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + + if (txn->mt_flags & MDB_TXN_WRITEMAP) { + insert = mdb_mid2l_append; + } else { + insert = mdb_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdb_tassert(txn, rc == 0); + txn->mt_dirty_room--; +} + +/** Allocate page numbers and memory for writing. Maintain me_pglast, + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. + * + * If there are free pages available from older transactions, they + * are re-used first. Otherwise allocate a new page at mt_next_pgno. + * Do not modify the freedB, just merge freeDB records into me_pghead[] + * and move me_pglast to say which records were consumed. 
Only this + * function can create me_pghead and move me_pglast/mt_next_pgno. + * @param[in] mc cursor A cursor handle identifying the transaction and + * database for which we are allocating. + * @param[in] num the number of pages to allocate. + * @param[out] mp Address of the allocated page(s). Requests for multiple pages + * will always be satisfied by a single contiguous chunk of memory. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) +{ +#ifdef MDB_PARANOID /* Seems like we can ignore this now */ + /* Get at most <Max_retries> more freeDB records once me_pghead + * has enough pages. If not enough, use new pages from the map. + * If <Paranoid> and mc is updating the freeDB, only get new + * records if me_pghead is empty. Then the freelist cannot play + * catch-up with itself by growing while trying to save it. + */ + enum { Paranoid = 1, Max_retries = 500 }; +#else + enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; +#endif + int rc, retry = num * 60; + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno, *mop = env->me_pghead; + unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1; + MDB_page *np; + txnid_t oldest = 0, last; + MDB_cursor_op op; + MDB_cursor m2; + int found_old = 0; + + /* If there are any loose pages, just use them */ + if (num == 1 && txn->mt_loose_pgs) { + np = txn->mt_loose_pgs; + txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); + txn->mt_loose_count--; + DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), + np->mp_pgno)); + *mp = np; + return MDB_SUCCESS; + } + + *mp = NULL; + + /* If our dirty list is already full, we can't do anything */ + if (txn->mt_dirty_room == 0) { + rc = MDB_TXN_FULL; + goto fail; + } + + for (op = MDB_FIRST;; op = MDB_NEXT) { + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl; + + /* Seek a big enough contiguous page range. Prefer + * pages at the tail, just truncating the list. 
+ */ + if (mop_len > n2) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i-n2] == pgno+n2) + goto search_done; + } while (--i > n2); + if (--retry < 0) + break; + } + + if (op == MDB_FIRST) { /* 1st iteration */ + /* Prepare to fetch more and coalesce */ + last = env->me_pglast; + oldest = env->me_pgoldest; + mdb_cursor_init(&m2, txn, FREE_DBI, NULL); + if (last) { + op = MDB_SET_RANGE; + key.mv_data = &last; /* will look up last+1 */ + key.mv_size = sizeof(last); + } + if (Paranoid && mc->mc_dbi == FREE_DBI) + retry = -1; + } + if (Paranoid && retry < 0 && mop_len) + break; + + last++; + /* Do not fetch more if the record will be too recent */ + if (oldest <= last) { + if (!found_old) { + oldest = mdb_find_oldest(txn); + env->me_pgoldest = oldest; + found_old = 1; + } + if (oldest <= last) + break; + } + rc = mdb_cursor_get(&m2, &key, NULL, op); + if (rc) { + if (rc == MDB_NOTFOUND) + break; + goto fail; + } + last = *(txnid_t*)key.mv_data; + if (oldest <= last) { + if (!found_old) { + oldest = mdb_find_oldest(txn); + env->me_pgoldest = oldest; + found_old = 1; + } + if (oldest <= last) + break; + } + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) + goto fail; + + idl = (MDB_ID *) data.mv_data; + i = idl[0]; + if (!mop) { + if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { + rc = ENOMEM; + goto fail; + } + } else { + if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) + goto fail; + mop = env->me_pghead; + } + env->me_pglast = last; +#if (MDB_DEBUG) > 1 + DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", + last, txn->mt_dbs[FREE_DBI].md_root, i)); + for (j = i; j; j--) + DPRINTF(("IDL %"Z"u", idl[j])); +#endif + /* Merge in descending sorted order */ + mdb_midl_xmerge(mop, idl); + mop_len = mop[0]; + } + + /* Use new pages from the map when nothing suitable in the freeDB */ + i = 0; + pgno = txn->mt_next_pgno; + if (pgno + num >= env->me_maxpg) { + DPUTS("DB size maxed out"); + rc 
= MDB_MAP_FULL; + goto fail; + } + +search_done: + if (env->me_flags & MDB_WRITEMAP) { + np = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + if (!(np = mdb_page_malloc(txn, num))) { + rc = ENOMEM; + goto fail; + } + } + if (i) { + mop[0] = mop_len -= num; + /* Move any stragglers down */ + for (j = i-num; j < mop_len; ) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; + } + np->mp_pgno = pgno; + mdb_page_dirty(txn, np); + *mp = np; + + return MDB_SUCCESS; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Copy the used portions of a non-overflow page. + * @param[in] dst page to copy into + * @param[in] src page to copy from + * @param[in] psize size of a page + */ +static void +mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) +{ + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; + + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. + */ + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper = (upper + PAGEBASE) & -Align; + memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); + memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), + psize - upper); + } else { + memcpy(dst, src, psize - unused); + } +} + +/** Pull a page off the txn's spill list, if present. + * If a page being referenced was spilled to disk in this txn, bring + * it back and make it dirty/writable again. + * @param[in] txn the transaction handle. + * @param[in] mp the page being referenced. It must not be dirty. + * @param[out] ret the writable page, if any. ret is unchanged if + * mp wasn't spilled. 
+ */ +static int +mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) +{ + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + unsigned x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; + + for (tx2 = txn; tx2; tx2=tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; + else + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdb_page_malloc(txn, num); + if (!np) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdb_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + /* If in current txn, this page is no longer spilled. + * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. + */ + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else + txn->mt_spill_pgs[x] |= 1; + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits + */ + + mdb_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; + } + } + return MDB_SUCCESS; +} + +/** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc cursor pointing to the page to be touched + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_page_touch(MDB_cursor *mc) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m2, *m3; + pgno_t pgno; + int rc; + + if (!F_ISSET(MP_FLAGS(mp), P_DIRTY)) { + if (txn->mt_flags & MDB_TXN_SPILLS) { + np = NULL; + rc = mdb_page_unspill(txn, mp, &np); + if (rc) + goto fail; + if (np) + goto done; + } + if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || + (rc = mdb_page_alloc(mc, 1, &np))) + goto fail; + pgno = np->mp_pgno; + DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), + mp->mp_pgno, pgno)); + mdb_cassert(mc, mp->mp_pgno != pgno); + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + /* Update the parent page, if any, to point to the new page */ + if (mc->mc_top) { + MDB_page *parent = mc->mc_pg[mc->mc_top-1]; + MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); + SETPGNO(node, pgno); + } else { + mc->mc_db->md_root = pgno; + } + } else if (txn->mt_parent && !IS_SUBP(mp)) { + MDB_ID2 mid, *dl = txn->mt_u.dirty_list; + pgno = mp->mp_pgno; + /* If txn has a parent, make sure the page is in our + * dirty list. + */ + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { /* bad cursor? 
*/ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + return 0; + } + } + mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); + /* No - copy it */ + np = mdb_page_malloc(txn, 1); + if (!np) + return ENOMEM; + mid.mid = pgno; + mid.mptr = np; + rc = mdb_mid2l_insert(dl, &mid); + mdb_cassert(mc, rc == 0); + } else { + return 0; + } + + mdb_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_flags |= P_DIRTY; + +done: + /* Adjust cursors pointing to mp */ + mc->mc_pg[mc->mc_top] = np; + m2 = txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_SUB) { + for (; m2; m2=m2->mc_next) { + m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (; m2; m2=m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) continue; + if (m2 == mc) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if (IS_LEAF(np)) + XCURSOR_REFRESH(m2, mc->mc_top, np); + } + } + } + return 0; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_env_sync(MDB_env *env, int force) +{ + int rc = 0; + if (env->me_flags & MDB_RDONLY) + return EACCES; + if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { + if (env->me_flags & MDB_WRITEMAP) { + int flags = ((env->me_flags & MDB_MAPASYNC) && !force) + ? 
MS_ASYNC : MS_SYNC; + if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) + rc = ErrCode(); +#ifdef _WIN32 + else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); +#endif + } else { +#ifdef BROKEN_FDATASYNC + if (env->me_flags & MDB_FSYNCONLY) { + if (fsync(env->me_fd)) + rc = ErrCode(); + } else +#endif + if (MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); + } + } + return rc; +} + +/** Back up parent txn's cursors, then grab the originals for tracking */ +static int +mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) +{ + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; + + for (i = src->mt_numdbs; --i >= 0; ) { + if ((mc = src->mt_cursors[i]) != NULL) { + size = sizeof(MDB_cursor); + if (mc->mc_xcursor) + size += sizeof(MDB_xcursor); + for (; mc; mc = bk->mc_next) { + bk = malloc(size); + if (!bk) + return ENOMEM; + *bk = *mc; + mc->mc_backup = bk; + mc->mc_db = &dst->mt_dbs[i]; + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. + */ + mc->mc_txn = dst; + mc->mc_dbflag = &dst->mt_dbflags[i]; + if ((mx = mc->mc_xcursor) != NULL) { + *(MDB_xcursor *)(bk+1) = *mx; + mx->mx_cursor.mc_txn = dst; + } + mc->mc_next = dst->mt_cursors[i]; + dst->mt_cursors[i] = mc; + } + } + } + return MDB_SUCCESS; +} + +/** Close this write txn's cursors, give parent txn's cursors back to parent. + * @param[in] txn the transaction handle. + * @param[in] merge true to keep changes to parent cursors, false to revert. + * @return 0 on success, non-zero on failure. 
+ */ +static void +mdb_cursors_close(MDB_txn *txn, unsigned merge) +{ + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; + + for (i = txn->mt_numdbs; --i >= 0; ) { + for (mc = cursors[i]; mc; mc = next) { + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + /* Commit changes to parent txn */ + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + /* Abort nested txn */ + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk+1); + } + mc = bk; + } + /* Only malloced cursors are permanently tracked. */ + free(mc); + } + cursors[i] = NULL; + } +} + +#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ +enum Pidlock_op { + Pidset, Pidcheck +}; +#else +enum Pidlock_op { + Pidset = F_SETLK, Pidcheck = F_GETLK +}; +#endif + +/** Set or check a pid lock. Set returns 0 on success. + * Check returns 0 if the process is certainly dead, nonzero if it may + * be alive (the lock exists or an error happened so we do not know). + * + * On Windows Pidset is a no-op, we merely check for the existence + * of the process with the given pid. On POSIX we use a single byte + * lock on the lockfile, set at an offset equal to the pid. + */ +static int +mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) +{ +#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ + int ret = 0; + HANDLE h; + if (op == Pidcheck) { + h = OpenProcess(env->me_pidquery, FALSE, pid); + /* No documented "no such process" code, but other program use this: */ + if (!h) + return ErrCode() != ERROR_INVALID_PARAMETER; + /* A process exists until all handles to it close. Has it exited? 
*/ + ret = WaitForSingleObject(h, 0) != 0; + CloseHandle(h); + } + return ret; +#else + for (;;) { + int rc; + struct flock lock_info; + memset(&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = pid; + lock_info.l_len = 1; + if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { + if (op == F_GETLK && lock_info.l_type != F_UNLCK) + rc = -1; + } else if ((rc = ErrCode()) == EINTR) { + continue; + } + return rc; + } +#endif +} + +/** Common code for #mdb_txn_begin() and #mdb_txn_renew(). + * @param[in] txn the transaction handle to initialize + * @return 0 on success, non-zero on failure. + */ +static int +mdb_txn_renew0(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_txninfo *ti = env->me_txns; + MDB_meta *meta; + unsigned int i, nr, flags = txn->mt_flags; + uint16_t x; + int rc, new_notls = 0; + + if ((flags &= MDB_TXN_RDONLY) != 0) { + if (!ti) { + meta = mdb_env_pick_meta(env); + txn->mt_txnid = meta->mm_txnid; + txn->mt_u.reader = NULL; + } else { + MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : + pthread_getspecific(env->me_txkey); + if (r) { + if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) + return MDB_BAD_RSLOT; + } else { + MDB_PID_T pid = env->me_pid; + MDB_THR_T tid = pthread_self(); + mdb_mutexref_t rmutex = env->me_rmutex; + + if (!env->me_live_reader) { + rc = mdb_reader_pid(env, Pidset, pid); + if (rc) + return rc; + env->me_live_reader = 1; + } + + if (LOCK_MUTEX(rc, env, rmutex)) + return rc; + nr = ti->mti_numreaders; + for (i=0; i<nr; i++) + if (ti->mti_readers[i].mr_pid == 0) + break; + if (i == env->me_maxreaders) { + UNLOCK_MUTEX(rmutex); + return MDB_READERS_FULL; + } + r = &ti->mti_readers[i]; + /* Claim the reader slot, carefully since other code + * uses the reader table un-mutexed: First reset the + * slot, next publish it in mti_numreaders. After + * that, it is safe for mdb_env_close() to touch it. + * When it will be closed, we can finally claim it. 
+ */ + r->mr_pid = 0; + r->mr_txnid = (txnid_t)-1; + r->mr_tid = tid; + if (i == nr) + ti->mti_numreaders = ++nr; + env->me_close_readers = nr; + r->mr_pid = pid; + UNLOCK_MUTEX(rmutex); + + new_notls = (env->me_flags & MDB_NOTLS); + if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { + r->mr_pid = 0; + return rc; + } + } + do /* LY: Retry on a race, ITS#7970. */ + r->mr_txnid = ti->mti_txnid; + while(r->mr_txnid != ti->mti_txnid); + txn->mt_txnid = r->mr_txnid; + txn->mt_u.reader = r; + meta = env->me_metas[txn->mt_txnid & 1]; + } + + } else { + /* Not yet touching txn == env->me_txn0, it may be active */ + if (ti) { + if (LOCK_MUTEX(rc, env, env->me_wmutex)) + return rc; + txn->mt_txnid = ti->mti_txnid; + meta = env->me_metas[txn->mt_txnid & 1]; + } else { + meta = mdb_env_pick_meta(env); + txn->mt_txnid = meta->mm_txnid; + } + txn->mt_txnid++; +#if MDB_DEBUG + if (txn->mt_txnid == mdb_debug_start) + mdb_debug = 1; +#endif + txn->mt_child = NULL; + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + txn->mt_dirty_room = MDB_IDL_UM_MAX; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_free_pgs = env->me_free_pgs; + txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; + env->me_txn = txn; + memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int)); + } + + /* Copy the DB info and flags */ + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); + + /* Moved to here to avoid a data race in read TXNs */ + txn->mt_next_pgno = meta->mm_last_pg+1; + + txn->mt_flags = flags; + + /* Setup db info */ + txn->mt_numdbs = env->me_numdbs; + for (i=CORE_DBS; i<txn->mt_numdbs; i++) { + x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = (x & MDB_VALID) ? 
DB_VALID|DB_USRVALID|DB_STALE : 0; + } + txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; + txn->mt_dbflags[FREE_DBI] = DB_VALID; + + if (env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + rc = MDB_PANIC; + } else if (env->me_maxpg < txn->mt_next_pgno) { + rc = MDB_MAP_RESIZED; + } else { + return MDB_SUCCESS; + } + mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); + return rc; +} + +int +mdb_txn_renew(MDB_txn *txn) +{ + int rc; + + if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) + return EINVAL; + + rc = mdb_txn_renew0(txn); + if (rc == MDB_SUCCESS) { + DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); + } + return rc; +} + +int +mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) +{ + MDB_txn *txn; + MDB_ntxn *ntxn; + int rc, size, tsize; + + flags &= MDB_TXN_BEGIN_FLAGS; + flags |= env->me_flags & MDB_WRITEMAP; + + if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ + return EACCES; + + if (parent) { + /* Nested transactions: Max 1 child, write txns only, no writemap */ + flags |= parent->mt_flags; + if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) { + return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; + } + /* Child txns save MDB_pgstate and use own copy of cursors */ + size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); + size += tsize = sizeof(MDB_ntxn); + } else if (flags & MDB_RDONLY) { + size = env->me_maxdbs * (sizeof(MDB_db)+1); + size += tsize = sizeof(MDB_txn); + } else { + /* Reuse preallocated write txn. However, do not touch it until + * mdb_txn_renew0() succeeds, since it currently may be active. 
+ */ + txn = env->me_txn0; + goto renew; + } + if ((txn = calloc(1, size)) == NULL) { + DPRINTF(("calloc: %s", strerror(errno))); + return ENOMEM; + } + txn->mt_dbxs = env->me_dbxs; /* static */ + txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); + txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; + txn->mt_flags = flags; + txn->mt_env = env; + + if (parent) { + unsigned int i; + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = parent->mt_dbiseqs; + txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); + if (!txn->mt_u.dirty_list || + !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) + { + free(txn->mt_u.dirty_list); + free(txn); + return ENOMEM; + } + txn->mt_txnid = parent->mt_txnid; + txn->mt_dirty_room = parent->mt_dirty_room; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_spill_pgs = NULL; + txn->mt_next_pgno = parent->mt_next_pgno; + parent->mt_flags |= MDB_TXN_HAS_CHILD; + parent->mt_child = txn; + txn->mt_parent = parent; + txn->mt_numdbs = parent->mt_numdbs; + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + /* Copy parent's mt_dbflags, but clear DB_NEW */ + for (i=0; i<txn->mt_numdbs; i++) + txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; + rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); + if (env->me_pghead) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + if (!rc) + rc = mdb_cursor_shadow(parent, txn); + if (rc) + mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); + } else { /* MDB_RDONLY */ + txn->mt_dbiseqs = env->me_dbiseqs; +renew: + rc = mdb_txn_renew0(txn); + } + if (rc) { + if (txn != env->me_txn0) + free(txn); + } else { + txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ + *ret = txn; + DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page 
%"Z"u", + txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', + (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); + } + + return rc; +} + +MDB_env * +mdb_txn_env(MDB_txn *txn) +{ + if(!txn) return NULL; + return txn->mt_env; +} + +size_t +mdb_txn_id(MDB_txn *txn) +{ + if(!txn) return 0; + return txn->mt_txnid; +} + +/** Export or close DBI handles opened in this txn. */ +static void +mdb_dbis_update(MDB_txn *txn, int keep) +{ + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; + + for (i = n; --i >= CORE_DBS;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.mv_data; + if (ptr) { + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + env->me_dbiseqs[i]++; + free(ptr); + } + } + } + } + if (keep && env->me_numdbs < n) + env->me_numdbs = n; +} + +/** End a transaction, except successful commit of a nested transaction. + * May be called twice for readonly txns: First reset it, then abort. + * @param[in] txn the transaction handle to end + * @param[in] mode why and how to end the transaction + */ +static void +mdb_txn_end(MDB_txn *txn, unsigned mode) +{ + MDB_env *env = txn->mt_env; +#if MDB_DEBUG + static const char *const names[] = MDB_END_NAMES; +#endif + + /* Export or close DBI handles opened in this txn */ + mdb_dbis_update(txn, mode & MDB_END_UPDATE); + + DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + names[mode & MDB_END_OPMASK], + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', + (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (txn->mt_u.reader) { + txn->mt_u.reader->mr_txnid = (txnid_t)-1; + if (!(env->me_flags & MDB_NOTLS)) { + txn->mt_u.reader = NULL; /* txn does not own reader */ + } else if (mode & MDB_END_SLOT) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } /* else txn owns the slot until it does MDB_END_SLOT */ + } + txn->mt_numdbs = 0; /* prevent further DBI activity */ + txn->mt_flags |= MDB_TXN_FINISHED; + + } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { + pgno_t *pghead = env->me_pghead; + + if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ + mdb_cursors_close(txn, 0); + if (!(env->me_flags & MDB_WRITEMAP)) { + mdb_dlist_free(txn); + } + + txn->mt_numdbs = 0; + txn->mt_flags = MDB_TXN_FINISHED; + + if (!txn->mt_parent) { + mdb_midl_shrink(&txn->mt_free_pgs); + env->me_free_pgs = txn->mt_free_pgs; + /* me_pgstate: */ + env->me_pghead = NULL; + env->me_pglast = 0; + + env->me_txn = NULL; + mode = 0; /* txn == env->me_txn0, do not free() it */ + + /* The writer mutex was locked in mdb_txn_begin. 
*/ + if (env->me_txns) + UNLOCK_MUTEX(env->me_wmutex); + } else { + txn->mt_parent->mt_child = NULL; + txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + mdb_midl_free(txn->mt_free_pgs); + free(txn->mt_u.dirty_list); + } + mdb_midl_free(txn->mt_spill_pgs); + + mdb_midl_free(pghead); + } + + if (mode & MDB_END_FREE) + free(txn); +} + +void +mdb_txn_reset(MDB_txn *txn) +{ + if (txn == NULL) + return; + + /* This call is only valid for read-only txns */ + if (!(txn->mt_flags & MDB_TXN_RDONLY)) + return; + + mdb_txn_end(txn, MDB_END_RESET); +} + +void +mdb_txn_abort(MDB_txn *txn) +{ + if (txn == NULL) + return; + + if (txn->mt_child) + mdb_txn_abort(txn->mt_child); + + mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); +} + +/** Save the freelist as of this transaction to the freeDB. + * This changes the freelist. Keep trying until it stabilizes. + */ +static int +mdb_freelist_save(MDB_txn *txn) +{ + /* env->me_pghead[] can grow and shrink during this call. + * env->me_pglast and txn->mt_free_pgs[] can only grow. + * Page numbers cannot disappear from txn->mt_free_pgs[]. + */ + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; + + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + + if (env->me_pghead) { + /* Make sure first page of freeDB is touched and on freelist */ + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + if (!env->me_pghead && txn->mt_loose_pgs) { + /* Put loose page numbers in mt_free_pgs, since + * we may be unable to return them to me_pghead. 
+ */ + MDB_page *mp = txn->mt_loose_pgs; + MDB_ID2 *dl = txn->mt_u.dirty_list; + unsigned x; + if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) + return rc; + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + /* must also remove from dirty list */ + if (txn->mt_flags & MDB_TXN_WRITEMAP) { + for (x=1; x<=dl[0].mid; x++) + if (dl[x].mid == mp->mp_pgno) + break; + mdb_tassert(txn, x <= dl[0].mid); + } else { + x = mdb_mid2l_search(dl, mp->mp_pgno); + mdb_tassert(txn, dl[x].mid == mp->mp_pgno); + mdb_dpage_free(env, mp); + } + dl[x].mptr = NULL; + } + { + /* squash freed slots out of the dirty list */ + unsigned y; + for (y=1; dl[y].mptr && y <= dl[0].mid; y++); + if (y <= dl[0].mid) { + for(x=y, y++;;) { + while (!dl[y].mptr && y <= dl[0].mid) y++; + if (y > dl[0].mid) break; + dl[x++] = dl[y++]; + } + dl[0].mid = x-1; + } else { + /* all slots freed */ + dl[0].mid = 0; + } + } + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + } + + /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ + clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) + ? SSIZE_MAX : maxfree_1pg; + + for (;;) { + /* Come back here after each Put() in case freelist changed */ + MDB_val key, data; + pgno_t *pgs; + ssize_t j; + + /* If using records from freeDB which we have not yet + * deleted, delete them and any we reserved for me_pghead. 
+ */ + while (pglast < env->me_pglast) { + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + return rc; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + mdb_tassert(txn, pglast <= env->me_pglast); + rc = mdb_cursor_del(&mc, 0); + if (rc) + return rc; + } + + /* Save the IDL of pages freed by this txn, to a single record */ + if (freecnt < txn->mt_free_pgs[0]) { + if (!freecnt) { + /* Make sure last page of freeDB is touched and on freelist */ + rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + free_pgs = txn->mt_free_pgs; + /* Write to last page of freeDB */ + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + /* Retry if mt_free_pgs[] grew during the Put() */ + free_pgs = txn->mt_free_pgs; + } while (freecnt < free_pgs[0]); + mdb_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); +#if (MDB_DEBUG) > 1 + { + unsigned int i = free_pgs[0]; + DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); + for (; i; i--) + DPRINTF(("IDL %"Z"u", free_pgs[i])); + } +#endif + continue; + } + + mop = env->me_pghead; + mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; + + /* Reserve records for me_pghead[]. Split it if multi-page, + * to avoid searching freeDB for a page range. Use keys in + * range [1,me_pglast]: Smaller than txnid of oldest reader. 
+ */ + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + /* Keep current record (overflow page), add a new one */ + head_id--; + head_room = 0; + } + /* (Re)write {key = head_id, IDL length = head_room} */ + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + /* Overflow multi-page for part of me_pghead */ + head_room /= head_id; /* amortize page sizes */ + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + /* Rare case, not bothering to delete this record */ + head_room = 0; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + /* IDL is initially empty, zero out at least the length */ + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); + total_room += head_room; + } + + /* Return loose page numbers to me_pghead, though usually none are + * left at this point. The pages themselves remain in dirty_list. 
+ */ + if (txn->mt_loose_pgs) { + MDB_page *mp = txn->mt_loose_pgs; + unsigned count = txn->mt_loose_count; + MDB_IDL loose; + /* Room for loose pages + temp IDL with same */ + if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) + return rc; + mop = env->me_pghead; + loose = mop + MDB_IDL_ALLOCLEN(mop) - count; + for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) + loose[ ++count ] = mp->mp_pgno; + loose[0] = count; + mdb_midl_sort(loose); + mdb_midl_xmerge(mop, loose); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + mop_len = mop[0]; + } + + /* Fill in the reserved me_pghead records */ + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; + + mop += mop_len; + rc = mdb_cursor_first(&mc, &key, &data); + for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { + txnid_t id = *(txnid_t *)key.mv_data; + ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + MDB_ID save; + + mdb_tassert(txn, len >= 0 && id <= env->me_pglast); + key.mv_data = &id; + if (len > mop_len) { + len = mop_len; + data.mv_size = (len + 1) * sizeof(MDB_ID); + } + data.mv_data = mop -= len; + save = mop[0]; + mop[0] = len; + rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); + mop[0] = save; + if (rc || !(mop_len -= len)) + break; + } + } + return rc; +} + +/** Flush (some) dirty pages to the map, after clearing their dirty flag. + * @param[in] txn the transaction that's being committed + * @param[in] keep number of initial pages in dirty_list to keep dirty. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_page_flush(MDB_txn *txn, int keep) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize, j; + int i, pagecount = dl[0].mid, rc; /* dl[0].mid holds the number of dirty-list entries */ + size_t size = 0, pos = 0; + pgno_t pgno = 0; + MDB_page *dp = NULL; +#ifdef _WIN32 + OVERLAPPED ov; +#else + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos = 0, wsize = 0, wres; + size_t next_pos = 1; /* impossible pos, so pos != next_pos */ + int n = 0; +#endif + + j = i = keep; /* i scans the entries after the first keep ones; j counts the entries that stay listed */ + + if (env->me_flags & MDB_WRITEMAP) { + /* Clear dirty flags */ + while (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE|P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[++j] = dl[i]; /* compact retained entries toward the front */ + continue; + } + dp->mp_flags &= ~P_DIRTY; + } + goto done; + } + + /* Write the pages */ + for (;;) { + if (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE|P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[i].mid = 0; /* mark as skipped; the page number is restored from dp->mp_pgno after the writes */ + continue; + } + pgno = dl[i].mid; + /* clear dirty flag */ + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) size *= dp->mp_pages; /* an overflow page is written as mp_pages pages in one piece */ + } +#ifdef _WIN32 + else break; + + /* Windows actually supports scatter/gather I/O, but only on + * unbuffered file handles. Since we're relying on the OS page + * cache for all our data, that's self-defeating. So we just + * write pages one at a time. We use the ov structure to set + * the write offset, to at least save the overhead of a Seek + * system call. + */ + DPRINTF(("committing page %"Z"u", pgno)); + memset(&ov, 0, sizeof(ov)); + ov.Offset = pos & 0xffffffff; + ov.OffsetHigh = pos >> 16 >> 16; + if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { + rc = ErrCode(); + DPRINTF(("WriteFile: %d", rc)); + return rc; + } +#else + /* Write up to MDB_COMMIT_PAGES dirty pages at a time.
*/ + if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { /* flush the batch when the next page is not contiguous, the iovec is full, or the batch would exceed MAX_WRITE */ + if (n) { +retry_write: + /* Write previous page(s) */ +#ifdef MDB_USE_PWRITEV + wres = pwritev(env->me_fd, iov, n, wpos); +#else + if (n == 1) { + wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); + } else { +retry_seek: + if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { + rc = ErrCode(); + if (rc == EINTR) + goto retry_seek; + DPRINTF(("lseek: %s", strerror(rc))); + return rc; + } + wres = writev(env->me_fd, iov, n); + } +#endif + if (wres != wsize) { + if (wres < 0) { + rc = ErrCode(); + if (rc == EINTR) + goto retry_write; + DPRINTF(("Write error: %s", strerror(rc))); + } else { + rc = EIO; /* TODO: Use which error code? */ + DPUTS("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; /* past the last entry: this pass only flushed the final batch */ + wpos = pos; + wsize = 0; + } + DPRINTF(("committing page %"Z"u", pgno)); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; +#endif /* _WIN32 */ + } + + /* MIPS has cache coherency issues, this is a no-op everywhere else + * Note: for any size >= on-chip cache size, entire on-chip cache is + * flushed.
+ */ + CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); + + for (i = keep; ++i <= pagecount; ) { + dp = dl[i].mptr; + /* This is a page we skipped above */ + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; + } + mdb_dpage_free(env, dp); + } + +done: + i--; /* i was pagecount+1 on both paths, so it is pagecount now */ + txn->mt_dirty_room += i - j; /* credit back the slots of entries that are no longer listed */ + dl[0].mid = j; + return MDB_SUCCESS; +} + +int +mdb_txn_commit(MDB_txn *txn) +{ + int rc; + unsigned int i, end_mode; + MDB_env *env; + + if (txn == NULL) + return EINVAL; + + /* mdb_txn_end() mode for a commit which writes nothing */ + end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; + + if (txn->mt_child) { /* commit any nested child transaction first */ + rc = mdb_txn_commit(txn->mt_child); + if (rc) + goto fail; + } + + env = txn->mt_env; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + goto done; /* read-only: end with the empty-commit mode set above */ + } + + if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) { + DPUTS("txn has failed/finished, can't commit"); + if (txn->mt_parent) + txn->mt_parent->mt_flags |= MDB_TXN_ERROR; + rc = MDB_BAD_TXN; + goto fail; + } + + if (txn->mt_parent) { /* nested txn: merge its state into the parent; this branch returns without flushing pages */ + MDB_txn *parent = txn->mt_parent; + MDB_page **lp; + MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; + + /* Append our free list to parent's */ + rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + if (rc) + goto fail; + mdb_midl_free(txn->mt_free_pgs); + /* Failures after this must either undo the changes + * to the parent or set MDB_TXN_ERROR in the parent. + */ + + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + + /* Merge our cursors into parent's and close them */ + mdb_cursors_close(txn, 1); + + /* Update parent's DB table.
*/ + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; + parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; + for (i=CORE_DBS; i < txn->mt_numdbs; i++) { + /* preserve parent's DB_NEW status */ + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + + dst = parent->mt_u.dirty_list; + src = txn->mt_u.dirty_list; + /* Remove anything in our dirty list from parent's spill list */ + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + /* Mark our dirty pages as deleted in parent spill list */ + for (i=0, len=src[0].mid; ++i <= len; ) { + MDB_ID pn = src[i].mid << 1; /* spill entries are page numbers shifted left by one; the low bit marks a deleted entry */ + while (pn > pspill[x]) + x--; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; + } + } + /* Squash deleted pagenums if we deleted any */ + for (x=y; ++x <= ps_len; ) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; + } + + /* Remove anything in our spill list from parent's dirty list */ + if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { + for (i=1; i<=txn->mt_spill_pgs[0]; i++) { + MDB_ID pn = txn->mt_spill_pgs[i]; + if (pn & 1) + continue; /* deleted spillpg */ + pn >>= 1; + y = mdb_mid2l_search(dst, pn); + if (y <= dst[0].mid && dst[y].mid == pn) { + free(dst[y].mptr); + while (y < dst[0].mid) { + dst[y] = dst[y+1]; + y++; + } + dst[0].mid--; + } + } + } + + /* Find len = length of merging our dirty list with parent's */ + x = dst[0].mid; + dst[0].mid = 0; /* simplify loops */ + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } + } + } else { /* Simplify the above for single-ancestor case */ + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + } + /* Merge our dirty list with parent's */ + y =
src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + mdb_tassert(txn, i == x); + dst[0].mid = len; + free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + /* TODO: Prevent failure here, so parent does not fail */ + rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (rc) + parent->mt_flags |= MDB_TXN_ERROR; + mdb_midl_free(txn->mt_spill_pgs); + mdb_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } + + /* Append our loose page list to parent's */ + for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) + ; + *lp = txn->mt_loose_pgs; + parent->mt_loose_count += txn->mt_loose_count; + + parent->mt_child = NULL; + mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + free(txn); + return rc; /* nonzero only if the spill-list append above failed */ + } + + if (txn != env->me_txn) { + DPUTS("attempt to commit unknown transaction"); + rc = EINVAL; + goto fail; + } + + mdb_cursors_close(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && + !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) + goto done; + + DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); + + /* Update DB root pointers */ + if (txn->mt_numdbs > CORE_DBS) { + MDB_cursor mc; + MDB_dbi i; + MDB_val data; + data.mv_size = sizeof(MDB_db); + + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + if (TXN_DBI_CHANGED(txn, i)) { + rc = MDB_BAD_DBI; + goto fail; + } + data.mv_data = &txn->mt_dbs[i]; + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, + F_SUBDATA); + if (rc) + goto fail; + } + } + } + + rc = mdb_freelist_save(txn); + if (rc) + goto fail; + + mdb_midl_free(env->me_pghead); + env->me_pghead = NULL; +
mdb_midl_shrink(&txn->mt_free_pgs); + +#if (MDB_DEBUG) > 2 + mdb_audit(txn); +#endif + + if ((rc = mdb_page_flush(txn, 0)) || + (rc = mdb_env_sync(env, 0)) || + (rc = mdb_env_write_meta(txn))) + goto fail; /* order matters: data pages, then sync, then the meta page */ + end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; + +done: + mdb_txn_end(txn, end_mode); + return MDB_SUCCESS; + +fail: + mdb_txn_abort(txn); + return rc; +} + +/** Read the environment parameters of a DB environment before + * mapping it into memory. + * @param[in] env the environment handle + * @param[out] meta address of where to store the meta information + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_env_read_header(MDB_env *env, MDB_meta *meta) +{ + MDB_metabuf pbuf; + MDB_page *p; + MDB_meta *m; + int i, rc, off; + enum { Size = sizeof(pbuf) }; + + /* We don't know the page size yet, so use a minimum value. + * Read both meta pages so we can use the latest one. + */ + + for (i=off=0; i < NUM_METAS; i++, off += meta->mm_psize) { /* off advances by the page size recorded in the first meta page read */ +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; + if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) + rc = 0; +#else + rc = pread(env->me_fd, &pbuf, Size, off); +#endif + if (rc != Size) { + if (rc == 0 && off == 0) + return ENOENT; /* nothing readable at offset 0 */ + rc = rc < 0 ?
(int) ErrCode() : MDB_INVALID; + DPRINTF(("read: %s", mdb_strerror(rc))); + return rc; + } + + p = (MDB_page *)&pbuf; + + if (!F_ISSET(p->mp_flags, P_META)) { + DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); + return MDB_INVALID; + } + + m = METADATA(p); + if (m->mm_magic != MDB_MAGIC) { + DPUTS("meta has invalid magic"); + return MDB_INVALID; + } + + if (m->mm_version != MDB_DATA_VERSION) { + DPRINTF(("database is version %u, expected version %u", + m->mm_version, MDB_DATA_VERSION)); + return MDB_VERSION_MISMATCH; + } + + if (off == 0 || m->mm_txnid > meta->mm_txnid) + *meta = *m; /* keep the meta page with the higher txn id */ + } + return 0; +} + +/** Fill in most of the zeroed #MDB_meta for an empty database environment */ +static void ESECT +mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) +{ + meta->mm_magic = MDB_MAGIC; + meta->mm_version = MDB_DATA_VERSION; + meta->mm_mapsize = env->me_mapsize; + meta->mm_psize = env->me_psize; + meta->mm_last_pg = NUM_METAS-1; + meta->mm_flags = env->me_flags & 0xffff; + meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; +} + +/** Write the environment parameters of a freshly created DB environment. + * @param[in] env the environment handle + * @param[in] meta the #MDB_meta to write + * @return 0 on success, non-zero on failure.
+ */ +static int ESECT +mdb_env_init_meta(MDB_env *env, MDB_meta *meta) +{ + MDB_page *p, *q; + int rc; + unsigned int psize; +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + ov.Offset = pos; \ + rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) +#else + int len; +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + len = pwrite(fd, ptr, size, pos); \ + if (len == -1 && ErrCode() == EINTR) continue; \ + rc = (len >= 0); break; } while(1) +#endif /* either variant leaves rc nonzero on success and zero on failure, with the byte count in len */ + + DPUTS("writing new meta page"); + + psize = env->me_psize; + + p = calloc(NUM_METAS, psize); /* one zeroed buffer holding all NUM_METAS meta pages */ + if (!p) + return ENOMEM; + + p->mp_pgno = 0; + p->mp_flags = P_META; + *(MDB_meta *)METADATA(p) = *meta; + + q = (MDB_page *)((char *)p + psize); /* second meta page, psize bytes into the buffer */ + q->mp_pgno = 1; + q->mp_flags = P_META; + *(MDB_meta *)METADATA(q) = *meta; + + DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); /* write the whole buffer at file offset 0 */ + if (!rc) + rc = ErrCode(); + else if ((unsigned) len == psize * NUM_METAS) + rc = MDB_SUCCESS; + else + rc = ENOSPC; /* the write succeeded but was short */ + free(p); + return rc; +} + +/** Update the environment info to commit a transaction. + * @param[in] txn the transaction that's being committed + * @return 0 on success, non-zero on failure.
+ */ +static int +mdb_env_write_meta(MDB_txn *txn) +{ + MDB_env *env; + MDB_meta meta, metab, *mp; + unsigned flags; + size_t mapsize; + off_t off; + int rc, len, toggle; + char *ptr; + HANDLE mfd; +#ifdef _WIN32 + OVERLAPPED ov; +#else + int r2; +#endif + + toggle = txn->mt_txnid & 1; /* the meta page slot is chosen by the low bit of the txn id */ + DPRINTF(("writing meta page %d for root page %"Z"u", + toggle, txn->mt_dbs[MAIN_DBI].md_root)); + + env = txn->mt_env; + flags = env->me_flags; + mp = env->me_metas[toggle]; + mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; /* start from the mapsize stored in the other meta page */ + /* Persist any increases of mapsize config */ + if (mapsize < env->me_mapsize) + mapsize = env->me_mapsize; + + if (flags & MDB_WRITEMAP) { /* writable map: update the meta page in place through the mapping */ + mp->mm_mapsize = mapsize; + mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + mp->mm_last_pg = txn->mt_next_pgno - 1; +#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \ + !(defined(__i386__) || defined(__x86_64__)) + /* LY: issue a memory barrier, if not x86. ITS#7969 */ + __sync_synchronize(); +#endif + mp->mm_txnid = txn->mt_txnid; /* the txn id is stored last, after the other fields */ + if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + unsigned meta_size = env->me_psize; + rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; /* rc temporarily carries the msync flag */ + ptr = (char *)mp - PAGEHDRSZ; +#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ + r2 = (ptr - env->me_map) & (env->me_os_psize - 1); + ptr -= r2; + meta_size += r2; +#endif + if (MDB_MSYNC(ptr, meta_size, rc)) { + rc = ErrCode(); + goto fail; + } + } + goto done; + } + metab.mm_txnid = mp->mm_txnid; /* remember the old values so they can be written back if the write fails */ + metab.mm_last_pg = mp->mm_last_pg; + + meta.mm_mapsize = mapsize; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; + + off = offsetof(MDB_meta, mm_mapsize); + ptr = (char *)&meta + off; + len = sizeof(MDB_meta) - off; /* only the fields from mm_mapsize to the end of MDB_meta are written */ + off += (char *)mp - env->me_map; /* off is now the position of that span relative to the start of the map */ + + /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC.
+ * (me_mfd goes to the same file as me_fd, but writing to it + * also syncs to disk. Avoids a separate fdatasync() call.) + */ + mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; +#ifdef _WIN32 + { + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) + rc = -1; + } +#else +retry_write: + rc = pwrite(mfd, ptr, len, off); +#endif + if (rc != len) { + rc = rc < 0 ? ErrCode() : EIO; /* a short write is reported as EIO */ +#ifndef _WIN32 + if (rc == EINTR) + goto retry_write; +#endif + DPUTS("write failed, disk error?"); + /* On a failure, the pagecache still contains the new data. + * Write some old data back, to prevent it from being used. + * Use the non-SYNC fd; we know it will fail anyway. + */ + meta.mm_last_pg = metab.mm_last_pg; + meta.mm_txnid = metab.mm_txnid; +#ifdef _WIN32 + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + WriteFile(env->me_fd, ptr, len, NULL, &ov); +#else + r2 = pwrite(env->me_fd, ptr, len, off); + (void)r2; /* Silence warnings. We don't care about pwrite's return value */ +#endif +fail: + env->me_flags |= MDB_FATAL_ERROR; /* also reached from the msync failure in the WRITEMAP path */ + return rc; + } + /* MIPS has cache coherency issues, this is a no-op everywhere else */ + CACHEFLUSH(env->me_map + off, len, DCACHE); +done: + /* Memory ordering issues are irrelevant; since the entire writer + * is wrapped by wmutex, all of these changes will become visible + * after the wmutex is unlocked. Since the DB is multi-version, + * readers will get consistent data regardless of how fresh or + * how stale their view of these values is. + */ + if (env->me_txns) + env->me_txns->mti_txnid = txn->mt_txnid; + + return MDB_SUCCESS; +} + +/** Check both meta pages to see which one is newer. + * @param[in] env the environment handle + * @return newest #MDB_meta.
+ */ +static MDB_meta * +mdb_env_pick_meta(const MDB_env *env) +{ + MDB_meta *const *metas = env->me_metas; + return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ]; /* the comparison yields index 1 only when meta 1 has the higher txn id */ +} + +int ESECT +mdb_env_create(MDB_env **env) +{ + MDB_env *e; + + e = calloc(1, sizeof(MDB_env)); + if (!e) + return ENOMEM; + + e->me_maxreaders = DEFAULT_READERS; + e->me_maxdbs = e->me_numdbs = CORE_DBS; + e->me_fd = INVALID_HANDLE_VALUE; /* no file handles are open yet */ + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_mfd = INVALID_HANDLE_VALUE; +#ifdef MDB_USE_POSIX_SEM + e->me_rmutex = SEM_FAILED; + e->me_wmutex = SEM_FAILED; +#endif + e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); + VGMEMP_CREATE(e,0,0); + *env = e; + return MDB_SUCCESS; +} + +static int ESECT +mdb_env_map(MDB_env *env, void *addr) +{ + MDB_page *p; + unsigned int flags = env->me_flags; +#ifdef _WIN32 + int rc; + HANDLE mh; + LONG sizelo, sizehi; + size_t msize; + + if (flags & MDB_RDONLY) { + /* Don't set explicit map size, use whatever exists */ + msize = 0; + sizelo = 0; + sizehi = 0; + } else { + msize = env->me_mapsize; + sizelo = msize & 0xffffffff; + sizehi = msize >> 16 >> 16; /* only needed on Win64 */ + + /* Windows won't create mappings for zero length files. + * and won't map more than the file size. + * Just set the maxsize right now. + */ + if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo + || !SetEndOfFile(env->me_fd) + || SetFilePointer(env->me_fd, 0, NULL, 0) != 0)) + return ErrCode(); + } + + mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? + PAGE_READWRITE : PAGE_READONLY, + sizehi, sizelo, NULL); + if (!mh) + return ErrCode(); + env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? + FILE_MAP_WRITE : FILE_MAP_READ, + 0, 0, msize, addr); + rc = env->me_map ?
0 : ErrCode(); + CloseHandle(mh); + if (rc) + return rc; +#else + int mmap_flags = MAP_SHARED; + int prot = PROT_READ; +#ifdef MAP_NOSYNC /* Used on FreeBSD */ + if (flags & MDB_NOSYNC) + mmap_flags |= MAP_NOSYNC; +#endif + if (flags & MDB_WRITEMAP) { + prot |= PROT_WRITE; + if (ftruncate(env->me_fd, env->me_mapsize) < 0) + return ErrCode(); + } + env->me_map = mmap(addr, env->me_mapsize, prot, mmap_flags, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; /* callers test me_map for NULL, so do not leave MAP_FAILED in it */ + return ErrCode(); + } + + if (flags & MDB_NORDAHEAD) { + /* Turn off readahead. It's harmful when the DB is larger than RAM. */ +#ifdef MADV_RANDOM + madvise(env->me_map, env->me_mapsize, MADV_RANDOM); +#else +#ifdef POSIX_MADV_RANDOM + posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); +#endif /* POSIX_MADV_RANDOM */ +#endif /* MADV_RANDOM */ + } +#endif /* _WIN32 */ + + /* Can happen because the address argument to mmap() is just a + * hint. mmap() can pick another, e.g. if the range is in use. + * The MAP_FIXED flag would prevent that, but then mmap could + * instead unmap existing pages to make room for the new map. + */ + if (addr && env->me_map != addr) + return EBUSY; /* TODO: Make a new MDB_* error code? */ + + p = (MDB_page *)env->me_map; + env->me_metas[0] = METADATA(p); + env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); /* second meta page sits me_psize bytes after the first */ + + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_mapsize(MDB_env *env, size_t size) +{ + /* If env is already open, caller is responsible for making + * sure there are no active txns.
+ */ + if (env->me_map) { + int rc; + MDB_meta *meta; + void *old; + if (env->me_txn) + return EINVAL; + meta = mdb_env_pick_meta(env); + if (!size) + size = meta->mm_mapsize; /* size 0 means: use the size stored in the newest meta page */ + { + /* Silently round up to minimum if the size is too small */ + size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; + if (size < minsize) + size = minsize; + } + munmap(env->me_map, env->me_mapsize); /* drop the old mapping before mapping at the new size */ + env->me_mapsize = size; + old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; + rc = mdb_env_map(env, old); + if (rc) + return rc; + } + env->me_mapsize = size; + if (env->me_psize) + env->me_maxpg = env->me_mapsize / env->me_psize; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) +{ + if (env->me_map) + return EINVAL; /* rejected once the environment is mapped */ + env->me_maxdbs = dbs + CORE_DBS; /* the stored limit adds CORE_DBS to the caller's count */ + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) +{ + if (env->me_map || readers < 1) + return EINVAL; + env->me_maxreaders = readers; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) +{ + if (!env || !readers) + return EINVAL; + *readers = env->me_maxreaders; + return MDB_SUCCESS; +} + +static int ESECT +mdb_fsize(HANDLE fd, size_t *size) +{ +#ifdef _WIN32 + LARGE_INTEGER fsize; + + if (!GetFileSizeEx(fd, &fsize)) + return ErrCode(); + + *size = fsize.QuadPart; +#else + struct stat st; + + if (fstat(fd, &st)) + return ErrCode(); + + *size = st.st_size; +#endif + return MDB_SUCCESS; +} + + +#ifdef _WIN32 +typedef wchar_t mdb_nchar_t; +# define MDB_NAME(str) L##str +# define mdb_name_cpy wcscpy +#else +/** Character type for file names: char on Unix, wchar_t on Windows */ +typedef char mdb_nchar_t; +# define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */ +# define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */ +#endif + +/** Filename - string of #mdb_nchar_t[] */ +typedef struct MDB_name { + int mn_len; /**< Length */ + int mn_alloced; /**< True if #mn_val was malloced */ +
mdb_nchar_t *mn_val; /**< Contents */ +} MDB_name; + +/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ +static const mdb_nchar_t *const mdb_suffixes[2][2] = { + { MDB_NAME("/data.mdb"), MDB_NAME("") }, + { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } +}; + +#define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ + +/** Set up filename + scratch area for filename suffix, for opening files. + * It should be freed with #mdb_fname_destroy(). + * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. + * + * @param[in] path Pathname for #mdb_env_open(). + * @param[in] envflags Whether a subdir and/or lockfile will be used. + * @param[out] fname Resulting filename, with room for a suffix if necessary. + */ +static int ESECT +mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) +{ + int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); + fname->mn_alloced = 0; +#ifdef _WIN32 + return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); +#else + fname->mn_len = strlen(path); + if (no_suffix) + fname->mn_val = (char *) path; /* no suffix will be appended, so the caller's string is used directly */ + else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { /* room for path, longest suffix and the terminating NUL */ + fname->mn_alloced = 1; + strcpy(fname->mn_val, path); + } + else + return ENOMEM; + return MDB_SUCCESS; +#endif +} + +/** Destroy \b fname from #mdb_fname_init() */ +#define mdb_fname_destroy(fname) \ + do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) + +#ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ +# define MDB_CLOEXEC O_CLOEXEC +#else +# define MDB_CLOEXEC 0 +#endif + +/** File type, access mode etc. for #mdb_fopen() */ +enum mdb_fopen_type { +#ifdef _WIN32 + MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS +#else + /* A comment in mdb_fopen() explains some O_* flag choices.
*/ + MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ + MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ + MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ + MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ + /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits + * distinguish otherwise-equal MDB_O_* constants from each other. + */ + MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, + MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ +#endif +}; + +/** Open an LMDB file. + * @param[in] env The LMDB environment. + * @param[in,out] fname Path from #mdb_fname_init(). A suffix is + * appended if necessary to create the filename, without changing mn_len. + * @param[in] which Determines file type, access mode, etc. + * @param[in] mode The Unix permissions for the file, if we create it. + * @param[out] res Resulting file handle. + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_fopen(const MDB_env *env, MDB_name *fname, + enum mdb_fopen_type which, mdb_mode_t mode, + HANDLE *res) +{ + int rc = MDB_SUCCESS; + HANDLE fd; +#ifdef _WIN32 + DWORD acc, share, disp, attrs; +#else + int flags; +#endif + + if (fname->mn_alloced) /* modifiable copy */ + mdb_name_cpy(fname->mn_val + fname->mn_len, + mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); + + /* The directory must already exist. Usually the file need not. + * MDB_O_META requires the file because we already created it using + * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. + * + * With MDB_O_COPY we do not want the OS to cache the writes, since + * the source data is already in the OS cache. + * + * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) + * to avoid the flock() issues noted under Caveats in lmdb.h.
+ * Also set it for other filehandles which the user cannot get at + * and close himself, which he may need after fork(). I.e. all but + * me_fd, which programs do use via mdb_env_get_fd(). + */ + +#ifdef _WIN32 + acc = GENERIC_READ|GENERIC_WRITE; + share = FILE_SHARE_READ|FILE_SHARE_WRITE; + disp = OPEN_ALWAYS; + attrs = FILE_ATTRIBUTE_NORMAL; + switch (which) { + case MDB_O_RDONLY: /* read-only datafile */ + acc = GENERIC_READ; + disp = OPEN_EXISTING; + break; + case MDB_O_META: /* for writing metapages */ + acc = GENERIC_WRITE; + disp = OPEN_EXISTING; + attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; + break; + case MDB_O_COPY: /* mdb_env_copy() & co */ + acc = GENERIC_WRITE; + share = 0; + disp = CREATE_NEW; + attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; + break; + default: break; /* silence gcc -Wswitch (not all enum values handled) */ + } + fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); +#else + fd = open(fname->mn_val, which & MDB_O_MASK, mode); /* the mask drops the enum-only distinguishing bit before open() */ +#endif + + if (fd == INVALID_HANDLE_VALUE) + rc = ErrCode(); +#ifndef _WIN32 + else { + if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { + /* Set CLOEXEC if we could not pass it to open() */ + if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) + (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); + } + if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { + /* This may require buffer alignment. There is no portable + * way to ask how much, so we require OS pagesize alignment. + */ +# ifdef F_NOCACHE /* __APPLE__ */ + (void) fcntl(fd, F_NOCACHE, 1); +# elif defined O_DIRECT + /* open(...O_DIRECT...) would break on filesystems without + * O_DIRECT support (ITS#7682). Try to set it here instead.
+ */ + if ((flags = fcntl(fd, F_GETFL)) != -1) + (void) fcntl(fd, F_SETFL, flags | O_DIRECT); +# endif + } + } +#endif /* !_WIN32 */ + + *res = fd; /* stored even on failure; rc tells the caller whether it is usable */ + return rc; +} + + +#ifdef BROKEN_FDATASYNC +#include <sys/utsname.h> /* struct utsname, uname() */ +#include <sys/vfs.h> /* struct statfs, fstatfs() */ +#endif + +/** Further setup required for opening an LMDB environment + */ +static int ESECT +mdb_env_open2(MDB_env *env) +{ + unsigned int flags = env->me_flags; + int i, newenv = 0, rc; + MDB_meta meta; + +#ifdef _WIN32 + /* See if we should use QueryLimited */ + rc = GetVersion(); + if ((rc & 0xff) > 5) + env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; + else + env->me_pidquery = PROCESS_QUERY_INFORMATION; +#endif /* _WIN32 */ + +#ifdef BROKEN_FDATASYNC + /* ext3/ext4 fdatasync is broken on some older Linux kernels. + * https://lkml.org/lkml/2012/9/3/83 + * Kernels after 3.6-rc6 are known good. + * https://lkml.org/lkml/2012/9/10/556 + * See if the DB is on ext3/ext4, then check for new enough kernel + * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known + * to be patched.
+ */ + { + struct statfs st; + fstatfs(env->me_fd, &st); + while (st.f_type == 0xEF53) { /* used as a breakable if: every path through the body ends in break */ + struct utsname uts; + int i; + uname(&uts); + if (uts.release[0] < '3') { + if (!strncmp(uts.release, "2.6.32.", 7)) { + i = atoi(uts.release+7); + if (i >= 60) + break; /* 2.6.32.60 and newer is OK */ + } else if (!strncmp(uts.release, "2.6.34.", 7)) { + i = atoi(uts.release+7); + if (i >= 15) + break; /* 2.6.34.15 and newer is OK */ + } + } else if (uts.release[0] == '3') { + i = atoi(uts.release+2); + if (i > 5) + break; /* 3.6 and newer is OK */ + if (i == 5) { + i = atoi(uts.release+4); + if (i >= 4) + break; /* 3.5.4 and newer is OK */ + } else if (i == 2) { + i = atoi(uts.release+4); + if (i >= 30) + break; /* 3.2.30 and newer is OK */ + } + } else { /* 4.x and newer is OK */ + break; + } + env->me_flags |= MDB_FSYNCONLY; /* reached only when no known-good kernel matched above */ + break; + } + } +#endif + + if ((i = mdb_env_read_header(env, &meta)) != 0) { + if (i != ENOENT) + return i; + DPUTS("new mdbenv"); + newenv = 1; + env->me_psize = env->me_os_psize; + if (env->me_psize > MAX_PAGESIZE) + env->me_psize = MAX_PAGESIZE; + memset(&meta, 0, sizeof(meta)); + mdb_env_init_meta0(env, &meta); + meta.mm_mapsize = DEFAULT_MAPSIZE; + } else { + env->me_psize = meta.mm_psize; + } + + /* Was a mapsize configured? */ + if (!env->me_mapsize) { + env->me_mapsize = meta.mm_mapsize; + } + { + /* Make sure mapsize >= committed data size. Even when using + * mm_mapsize, which could be broken in old files (ITS#7789). + */ + size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; + if (env->me_mapsize < minsize) + env->me_mapsize = minsize; + } + meta.mm_mapsize = env->me_mapsize; + + if (newenv && !(flags & MDB_FIXEDMAP)) { + /* mdb_env_map() may grow the datafile. Write the metapages + * first, so the file will be valid if initialization fails. + * Except with FIXEDMAP, since we do not yet know mm_address.
+ * We could fill in mm_address later, but then a different + * program might end up doing that - one with a memory layout + * and map address which does not suit the main program. + */ + rc = mdb_env_init_meta(env, &meta); + if (rc) + return rc; + newenv = 0; /* meta pages are written; skip the second init below */ + } + + rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); + if (rc) + return rc; + + if (newenv) { /* only still set for a new environment opened with MDB_FIXEDMAP */ + if (flags & MDB_FIXEDMAP) + meta.mm_address = env->me_map; + i = mdb_env_init_meta(env, &meta); + if (i != MDB_SUCCESS) { + return i; + } + } + + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) + - sizeof(indx_t); +#if !(MDB_MAXKEYSIZE) + env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); +#endif + env->me_maxpg = env->me_mapsize / env->me_psize; + +#if MDB_DEBUG + { + MDB_meta *meta = mdb_env_pick_meta(env); + MDB_db *db = &meta->mm_dbs[MAIN_DBI]; + + DPRINTF(("opened database version %u, pagesize %u", + meta->mm_version, env->me_psize)); + DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); + DPRINTF(("depth: %u", db->md_depth)); + DPRINTF(("entries: %"Z"u", db->md_entries)); + DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); + DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); + DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); + DPRINTF(("root: %"Z"u", db->md_root)); + } +#endif + + return MDB_SUCCESS; +} + + +/** Release a reader thread's slot in the reader lock table. + * This function is called automatically when a thread exits. + * @param[in] ptr This points to the slot in the reader lock table. + */ +static void +mdb_env_reader_dest(void *ptr) +{ + MDB_reader *reader = ptr; + +#ifndef _WIN32 + if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ +#endif + /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ + reader->mr_pid = 0; +} + +#ifdef _WIN32 +/** Junk for arranging thread-specific callbacks on Windows.
This is + * necessarily platform and compiler-specific. Windows supports up + * to 1088 keys. Let's assume nobody opens more than 64 environments + * in a single process, for now. They can override this if needed. + */ +#ifndef MAX_TLS_KEYS +#define MAX_TLS_KEYS 64 +#endif +static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS]; +static int mdb_tls_nkeys; + +static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) +{ + int i; + switch(reason) { + case DLL_PROCESS_ATTACH: break; + case DLL_THREAD_ATTACH: break; + case DLL_THREAD_DETACH: + for (i=0; ime_txns->mti_txnid = meta->mm_txnid; + +#ifdef _WIN32 + { + OVERLAPPED ov; + /* First acquire a shared lock. The Unlock will + * then release the existing exclusive lock. + */ + memset(&ov, 0, sizeof(ov)); + if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + rc = ErrCode(); + } else { + UnlockFile(env->me_lfd, 0, 0, 1, 0); + *excl = 0; + } + } +#else + { + struct flock lock_info; + /* The shared lock replaces the existing lock */ + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_RDLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = ErrCode()) == EINTR) ; + *excl = rc ? -1 : 0; /* error may mean we lost the lock */ + } +#endif + + return rc; +} + +/** Try to get exclusive lock, otherwise shared. + * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
+ */ +static int ESECT +mdb_env_excl_lock(MDB_env *env, int *excl) +{ + int rc = 0; +#ifdef _WIN32 + if (LockFile(env->me_lfd, 0, 0, 1, 0)) { + *excl = 1; + } else { + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + *excl = 0; + } else { + rc = ErrCode(); + } + } +#else + struct flock lock_info; + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; /* the lock covers only the first byte of the lock file */ + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = ErrCode()) == EINTR) ; /* try for the write lock, retrying while interrupted */ + if (!rc) { + *excl = 1; + } else +# ifndef MDB_USE_POSIX_MUTEX + if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */ +# endif + { + lock_info.l_type = F_RDLCK; + while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && + (rc = ErrCode()) == EINTR) ; /* fall back to waiting for a read lock on the same byte */ + if (rc == 0) + *excl = 0; + } +#endif + return rc; +} + +#ifdef MDB_USE_HASH +/* + * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code + * + * @(#) $Revision: 5.1 $ + * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ + * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html + * + *** + * + * Please do not copyright this code. This code is in the public domain. + * + * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO + * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + * + * By: + * chongo /\oo/\ + * http://www.isthe.com/chongo/ + * + * Share and Enjoy!
:-) + */ + +typedef unsigned long long mdb_hash_t; +#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) + +/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * @param[in] val value to hash + * @param[in] hval initial value for hash + * @return 64 bit hash + * + * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the + * hval arg on the first call. + */ +static mdb_hash_t +mdb_hash_val(MDB_val *val, mdb_hash_t hval) +{ + unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ + unsigned char *end = s + val->mv_size; + /* + * FNV-1a hash each octet of the string + */ + while (s < end) { + /* xor the bottom with the current octet */ + hval ^= (mdb_hash_t)*s++; + + /* multiply by the 64 bit FNV magic prime mod 2^64 */ + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + /* return our new hash value */ + return hval; +} + +/** Hash the string and output the encoded hash. + * This uses modified RFC1924 Ascii85 encoding to accommodate systems with + * very short name limits. We don't care about the encoding being reversible, + * we just want to preserve as many bits of the input as possible in a + * small printable string. + * @param[in] str string to hash + * @param[out] encbuf an array of 11 chars to hold the hash + */ +static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; + +static void ESECT +mdb_pack85(unsigned long l, char *out) +{ + int i; + + for (i=0; i<5; i++) { + *out++ = mdb_a85[l % 85]; + l /= 85; + } +} + +static void ESECT +mdb_hash_enc(MDB_val *val, char *encbuf) +{ + mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); + + mdb_pack85(h, encbuf); + mdb_pack85(h>>32, encbuf+5); + encbuf[10] = '\0'; +} +#endif + +/** Open and/or initialize the lock region for the environment. + * @param[in] env The LMDB environment. + * @param[in] fname Filename + scratch area, from #mdb_fname_init(). 
+ * @param[in] mode The Unix permissions for the file, if we create it.
+ * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
+ * @return 0 on success, non-zero on failure.
+ *
+ * Steps, in order: open the lock file into env->me_lfd; create the
+ * reader TLS key unless #MDB_NOTLS is set; take the file lock with
+ * mdb_env_excl_lock(); size and map the lock region into env->me_txns;
+ * then either initialize the region and create its mutexes (exclusive
+ * lock held) or validate the region and, where the platform uses named
+ * mutexes/semaphores, open the existing ones.
+ */
+static int ESECT
+mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl)
+{
+#ifdef _WIN32
+# define MDB_ERRCODE_ROFS	ERROR_WRITE_PROTECT
+#else
+# define MDB_ERRCODE_ROFS	EROFS
+#endif
+	int rc;
+	off_t size, rsize;	/* size: current file size, rsize: size of the region to map */
+
+	rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd);
+	if (rc) {
+		/* Omit lockfile if read-only env on read-only filesystem */
+		if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
+			return MDB_SUCCESS;
+		}
+		goto fail;
+	}
+
+	if (!(env->me_flags & MDB_NOTLS)) {
+		rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
+		if (rc)
+			goto fail;
+		env->me_flags |= MDB_ENV_TXKEY;	/* records that me_txkey exists; mdb_env_close0() deletes it */
+#ifdef _WIN32
+		/* Windows TLS callbacks need help finding their TLS info. */
+		if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
+			rc = MDB_TLS_FULL;
+			goto fail;
+		}
+		mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
+#endif
+	}
+
+	/* Try to get exclusive lock. If we succeed, then
+	 * nobody is using the lock region and we should initialize it.
+	 */
+	if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
+
+#ifdef _WIN32
+	size = GetFileSize(env->me_lfd, NULL);
+#else
+	size = lseek(env->me_lfd, 0, SEEK_END);
+	if (size == -1) goto fail_errno;
+#endif
+	/* MDB_txninfo is counted as already holding one reader slot, hence me_maxreaders-1. */
+	rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
+	if (size < rsize && *excl > 0) {
+		/* File too small and we hold it exclusively: grow it to rsize. */
+#ifdef _WIN32
+		if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
+			|| !SetEndOfFile(env->me_lfd))
+			goto fail_errno;
+#else
+		if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
+#endif
+	} else {
+		/* File not grown: map what is there and derive the
+		 * reader count from the existing file size instead.
+		 */
+		rsize = size;
+		size = rsize - sizeof(MDB_txninfo);
+		env->me_maxreaders = size/sizeof(MDB_reader) + 1;
+	}
+	{
+#ifdef _WIN32
+		HANDLE mh;
+		mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
+			0, 0, NULL);
+		if (!mh) goto fail_errno;
+		env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
+		CloseHandle(mh);	/* the mapping handle is closed whether or not the view was mapped */
+		if (!env->me_txns) goto fail_errno;
+#else
+		void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
+			env->me_lfd, 0);
+		if (m == MAP_FAILED) goto fail_errno;
+		env->me_txns = m;
+#endif
+	}
+	if (*excl > 0) {	/* exclusive lock held: initialize the region */
+#ifdef _WIN32
+		BY_HANDLE_FILE_INFORMATION stbuf;
+		struct {
+			DWORD volume;
+			DWORD nhigh;
+			DWORD nlow;
+		} idbuf;	/* lock file identity, hashed into the mutex names */
+		MDB_val val;
+		char encbuf[11];
+
+		if (!mdb_sec_inited) {
+			InitializeSecurityDescriptor(&mdb_null_sd,
+				SECURITY_DESCRIPTOR_REVISION);
+			SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
+			mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
+			mdb_all_sa.bInheritHandle = FALSE;
+			mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
+			mdb_sec_inited = 1;
+		}
+		if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
+		idbuf.volume = stbuf.dwVolumeSerialNumber;
+		idbuf.nhigh  = stbuf.nFileIndexHigh;
+		idbuf.nlow   = stbuf.nFileIndexLow;
+		val.mv_data = &idbuf;
+		val.mv_size = sizeof(idbuf);
+		mdb_hash_enc(&val, encbuf);
+		/* The names are stored in the shared region; the non-exclusive branch below opens the mutexes by these names. */
+		sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf);
+		sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf);
+		env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
+		if (!env->me_rmutex) goto fail_errno;
+		env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
+		if (!env->me_wmutex) goto fail_errno;
+#elif defined(MDB_USE_POSIX_SEM)
+		struct stat stbuf;
+		struct {
+			dev_t dev;
+			ino_t ino;
+		} idbuf;	/* lock file identity, hashed into the semaphore names */
+		MDB_val val;
+		char encbuf[11];
+
+#if defined(__NetBSD__)
+#define	MDB_SHORT_SEMNAMES	1	/* limited to 14 chars */
+#endif
+		if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
+		idbuf.dev = stbuf.st_dev;
+		idbuf.ino = stbuf.st_ino;
+		val.mv_data = &idbuf;
+		val.mv_size = sizeof(idbuf);
+		mdb_hash_enc(&val, encbuf);
+#ifdef MDB_SHORT_SEMNAMES
+		encbuf[9] = '\0';	/* drop name from 15 chars to 14 chars */
+#endif
+		sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf);
+		sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf);
+		/* Clean up after a previous run, if needed:  Try to
+		 * remove both semaphores before doing anything else.
+		 */
+		sem_unlink(env->me_txns->mti_rmname);
+		sem_unlink(env->me_txns->mti_wmname);
+		env->me_rmutex = sem_open(env->me_txns->mti_rmname,
+			O_CREAT|O_EXCL, mode, 1);
+		if (env->me_rmutex == SEM_FAILED) goto fail_errno;
+		env->me_wmutex = sem_open(env->me_txns->mti_wmname,
+			O_CREAT|O_EXCL, mode, 1);
+		if (env->me_wmutex == SEM_FAILED) goto fail_errno;
+#else	/* MDB_USE_POSIX_MUTEX: */
+		pthread_mutexattr_t mattr;
+
+		/* Solaris needs this before initing a robust mutex.  Otherwise
+		 * it may skip the init and return EBUSY "seems someone already
+		 * inited" or EINVAL "it was inited differently".
+		 */
+		memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex));
+		memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex));
+
+		if ((rc = pthread_mutexattr_init(&mattr)))
+			goto fail;
+
+		/* Each step runs only if the previous one succeeded; the attr is destroyed either way. */
+		rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
+#ifdef MDB_ROBUST_SUPPORTED
+		if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST);
+#endif
+		if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr);
+		if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr);
+		pthread_mutexattr_destroy(&mattr);
+		if (rc)
+			goto fail;
+#endif	/* _WIN32 || MDB_USE_POSIX_SEM */
+
+		env->me_txns->mti_magic = MDB_MAGIC;
+		env->me_txns->mti_format = MDB_LOCK_FORMAT;
+		env->me_txns->mti_txnid = 0;
+		env->me_txns->mti_numreaders = 0;
+
+	} else {	/* no exclusive lock: validate the existing region */
+		if (env->me_txns->mti_magic != MDB_MAGIC) {
+			DPUTS("lock region has invalid magic");
+			rc = MDB_INVALID;
+			goto fail;
+		}
+		if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
+			DPRINTF(("lock region has format+version 0x%x, expected 0x%x",
+				env->me_txns->mti_format, MDB_LOCK_FORMAT));
+			rc = MDB_VERSION_MISMATCH;
+			goto fail;
+		}
+		/* NOTE(review): presumably this reads the error left by the failed
+		 * exclusive-lock attempt in mdb_env_excl_lock() - confirm.
+		 */
+		rc = ErrCode();
+		if (rc && rc != EACCES && rc != EAGAIN) {
+			goto fail;
+		}
+#ifdef _WIN32
+		env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
+		if (!env->me_rmutex) goto fail_errno;
+		env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
+		if (!env->me_wmutex) goto fail_errno;
+#elif defined(MDB_USE_POSIX_SEM)
+		env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
+		if (env->me_rmutex == SEM_FAILED) goto fail_errno;
+		env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
+		if (env->me_wmutex == SEM_FAILED) goto fail_errno;
+#endif
+	}
+	return MDB_SUCCESS;
+
+fail_errno:	/* entered when the failing call reported its error through ErrCode() */
+	rc = ErrCode();
+fail:		/* entered when rc already holds the error */
+	return rc;
+}
+
+	/** Only a subset of the @ref mdb_env flags can be changed
+	 *	at runtime. Changing other flags requires closing the
+	 *	environment and re-opening it with the new flags.
+ */ +#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ + MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) + +#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) +# error "Persistent DB flags & env flags overlap, but both go in mm_flags" +#endif + +int ESECT +mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) +{ + int rc, excl = -1; + MDB_name fname; + + if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) + return EINVAL; + + flags |= env->me_flags; + + rc = mdb_fname_init(path, flags, &fname); + if (rc) + return rc; + + if (flags & MDB_RDONLY) { + /* silently ignore WRITEMAP when we're only getting read access */ + flags &= ~MDB_WRITEMAP; + } else { + if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } + env->me_flags = flags |= MDB_ENV_ACTIVE; + if (rc) + goto leave; + + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { + rc = ENOMEM; + goto leave; + } + env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ + + /* For RDONLY, get lockfile after we know datafile exists */ + if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { + rc = mdb_env_setup_locks(env, &fname, mode, &excl); + if (rc) + goto leave; + } + + rc = mdb_fopen(env, &fname, + (flags & MDB_RDONLY) ? 
MDB_O_RDONLY : MDB_O_RDWR, + mode, &env->me_fd); + if (rc) + goto leave; + + if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { + rc = mdb_env_setup_locks(env, &fname, mode, &excl); + if (rc) + goto leave; + } + + if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { + if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { + /* Synchronous fd for meta writes. Needed even with + * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. + */ + rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); + if (rc) + goto leave; + } + DPRINTF(("opened dbenv %p", (void *) env)); + if (excl > 0) { + rc = mdb_env_share_locks(env, &excl); + if (rc) + goto leave; + } + if (!(flags & MDB_RDONLY)) { + MDB_txn *txn; + int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * + (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); + if ((env->me_pbuf = calloc(1, env->me_psize)) && + (txn = calloc(1, size))) + { + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = MDB_TXN_FINISHED; + env->me_txn0 = txn; + } else { + rc = ENOMEM; + } + } + } + +leave: + if (rc) { + mdb_env_close0(env, excl); + } + mdb_fname_destroy(fname); + return rc; +} + +/** Destroy resources from mdb_env_open(), clear our readers & DBIs */ +static void ESECT +mdb_env_close0(MDB_env *env, int excl) +{ + int i; + + if (!(env->me_flags & MDB_ENV_ACTIVE)) + return; + + /* Doing this here since me_dbxs may not exist during mdb_env_close */ + if (env->me_dbxs) { + for (i = env->me_maxdbs; --i >= CORE_DBS; ) + free(env->me_dbxs[i].md_name.mv_data); + free(env->me_dbxs); + } + + free(env->me_pbuf); + free(env->me_dbiseqs); + free(env->me_dbflags); + free(env->me_path); + free(env->me_dirty_list); + free(env->me_txn0); + 
mdb_midl_free(env->me_free_pgs); + + if (env->me_flags & MDB_ENV_TXKEY) { + pthread_key_delete(env->me_txkey); +#ifdef _WIN32 + /* Delete our key from the global list */ + for (i=0; ime_txkey) { + mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; + mdb_tls_nkeys--; + break; + } +#endif + } + + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); + } + if (env->me_mfd != INVALID_HANDLE_VALUE) + (void) close(env->me_mfd); + if (env->me_fd != INVALID_HANDLE_VALUE) + (void) close(env->me_fd); + if (env->me_txns) { + MDB_PID_T pid = getpid(); + /* Clearing readers is done in this function because + * me_txkey with its destructor must be disabled first. + * + * We skip the the reader mutex, so we touch only + * data owned by this process (me_close_readers and + * our readers), and clear each reader atomically. + */ + for (i = env->me_close_readers; --i >= 0; ) + if (env->me_txns->mti_readers[i].mr_pid == pid) + env->me_txns->mti_readers[i].mr_pid = 0; +#ifdef _WIN32 + if (env->me_rmutex) { + CloseHandle(env->me_rmutex); + if (env->me_wmutex) CloseHandle(env->me_wmutex); + } + /* Windows automatically destroys the mutexes when + * the last handle closes. + */ +#elif defined(MDB_USE_POSIX_SEM) + if (env->me_rmutex != SEM_FAILED) { + sem_close(env->me_rmutex); + if (env->me_wmutex != SEM_FAILED) + sem_close(env->me_wmutex); + /* If we have the filelock: If we are the + * only remaining user, clean up semaphores. + */ + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) { + sem_unlink(env->me_txns->mti_rmname); + sem_unlink(env->me_txns->mti_wmname); + } + } +#elif defined(MDB_ROBUST_SUPPORTED) + /* If we have the filelock: If we are the + * only remaining user, clean up robust + * mutexes. 
+ */ + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) { + pthread_mutex_destroy(env->me_txns->mti_rmutex); + pthread_mutex_destroy(env->me_txns->mti_wmutex); + } +#endif + munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); + } + if (env->me_lfd != INVALID_HANDLE_VALUE) { +#ifdef _WIN32 + if (excl >= 0) { + /* Unlock the lockfile. Windows would have unlocked it + * after closing anyway, but not necessarily at once. + */ + UnlockFile(env->me_lfd, 0, 0, 1, 0); + } +#endif + (void) close(env->me_lfd); + } + + env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); +} + +void ESECT +mdb_env_close(MDB_env *env) +{ + MDB_page *dp; + + if (env == NULL) + return; + + VGMEMP_DESTROY(env); + while ((dp = env->me_dpages) != NULL) { + VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dpages = dp->mp_next; + free(dp); + } + + mdb_env_close0(env, 0); + free(env); +} + +/** Compare two items pointing at aligned size_t's */ +static int +mdb_cmp_long(const MDB_val *a, const MDB_val *b) +{ + return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : + *(size_t *)a->mv_data > *(size_t *)b->mv_data; +} + +/** Compare two items pointing at aligned unsigned int's. + * + * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, + * but #mdb_cmp_clong() is called instead if the data type is size_t. + */ +static int +mdb_cmp_int(const MDB_val *a, const MDB_val *b) +{ + return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : + *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; +} + +/** Compare two items pointing at unsigned ints of unknown alignment. + * Nodes and keys are guaranteed to be 2-byte aligned. 
+ */ +static int +mdb_cmp_cint(const MDB_val *a, const MDB_val *b) +{ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short *u, *c; + int x; + + u = (unsigned short *) ((char *) a->mv_data + a->mv_size); + c = (unsigned short *) ((char *) b->mv_data + a->mv_size); + do { + x = *--u - *--c; + } while(!x && u > (unsigned short *)a->mv_data); + return x; +#else + unsigned short *u, *c, *end; + int x; + + end = (unsigned short *) ((char *) a->mv_data + a->mv_size); + u = (unsigned short *)a->mv_data; + c = (unsigned short *)b->mv_data; + do { + x = *u++ - *c++; + } while(!x && u < end); + return x; +#endif +} + +/** Compare two items lexically */ +static int +mdb_cmp_memn(const MDB_val *a, const MDB_val *b) +{ + int diff; + ssize_t len_diff; + unsigned int len; + + len = a->mv_size; + len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; + if (len_diff > 0) { + len = b->mv_size; + len_diff = 1; + } + + diff = memcmp(a->mv_data, b->mv_data, len); + return diff ? diff : len_diff<0 ? -1 : len_diff; +} + +/** Compare two items in reverse byte order */ +static int +mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) +{ + const unsigned char *p1, *p2, *p1_lim; + ssize_t len_diff; + int diff; + + p1_lim = (const unsigned char *)a->mv_data; + p1 = (const unsigned char *)a->mv_data + a->mv_size; + p2 = (const unsigned char *)b->mv_data + b->mv_size; + + len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; + if (len_diff > 0) { + p1_lim += len_diff; + len_diff = 1; + } + + while (p1 > p1_lim) { + diff = *--p1 - *--p2; + if (diff) + return diff; + } + return len_diff<0 ? -1 : len_diff; +} + +/** Search for key within a page, using binary search. + * Returns the smallest entry larger or equal to the key. + * If exactp is non-null, stores whether the found entry was an exact match + * in *exactp (1 or 0). + * Updates the cursor index with the index of the found entry. + * If no entry larger or equal to the key is found, returns NULL. 
+ * @param[in] mc cursor whose top page, mc_pg[mc_top], is searched.
+ * @param[in] key the key to look for.
+ * @param[out] exactp if non-NULL, set to 1 on an exact match, else 0.
+ * @return the node found, or NULL. For LEAF2 pages the returned
+ *	pointer is a placeholder (node 0); use mc_ki[mc_top] instead.
+ */
+static MDB_node *
+mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
+{
+	unsigned int	 i = 0, nkeys;
+	int		 low, high;
+	int		 rc = 0;	/* result of the last comparison of key against entry i */
+	MDB_page *mp = mc->mc_pg[mc->mc_top];
+	MDB_node	*node = NULL;
+	MDB_val	 nodekey;
+	MDB_cmp_func *cmp;
+	DKBUF;
+
+	nkeys = NUMKEYS(mp);
+
+	DPRINTF(("searching %u keys in %s %spage %"Z"u",
+	    nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
+	    mdb_dbg_pgno(mp)));
+
+	low = IS_LEAF(mp) ? 0 : 1;	/* on non-leaf pages entry 0 is never compared */
+	high = nkeys - 1;
+	cmp = mc->mc_dbx->md_cmp;
+
+	/* Branch pages have no data, so if using integer keys,
+	 * alignment is guaranteed. Use faster mdb_cmp_int.
+	 * (or mdb_cmp_long when the keys are size_t sized).
+	 */
+	if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
+		if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
+			cmp = mdb_cmp_long;
+		else
+			cmp = mdb_cmp_int;
+	}
+
+	if (IS_LEAF2(mp)) {
+		nodekey.mv_size = mc->mc_db->md_pad;	/* LEAF2: every key has this fixed size */
+		node = NODEPTR(mp, 0);	/* fake */
+		while (low <= high) {
+			i = (low + high) >> 1;	/* midpoint */
+			nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
+			rc = cmp(key, &nodekey);
+			DPRINTF(("found leaf index %u [%s], rc = %i",
+			    i, DKEY(&nodekey), rc));
+			if (rc == 0)
+				break;
+			if (rc > 0)
+				low = i + 1;
+			else
+				high = i - 1;
+		}
+	} else {
+		while (low <= high) {
+			i = (low + high) >> 1;	/* midpoint */
+
+			node = NODEPTR(mp, i);
+			nodekey.mv_size = NODEKSZ(node);
+			nodekey.mv_data = NODEKEY(node);
+
+			rc = cmp(key, &nodekey);
+#if MDB_DEBUG
+			if (IS_LEAF(mp))
+				DPRINTF(("found leaf index %u [%s], rc = %i",
+				    i, DKEY(&nodekey), rc));
+			else
+				DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i",
+				    i, DKEY(&nodekey), NODEPGNO(node), rc));
+#endif
+			if (rc == 0)
+				break;
+			if (rc > 0)
+				low = i + 1;
+			else
+				high = i - 1;
+		}
+	}
+
+	if (rc > 0) {	/* Found entry is less than the key. */
+		i++;	/* Skip to get the smallest entry larger than key. */
+		if (!IS_LEAF2(mp))
+			node = NODEPTR(mp, i);	/* may be one past the last entry; i >= nkeys is rejected below */
+	}
+	if (exactp)
+		*exactp = (rc == 0 && nkeys > 0);	/* rc stays 0 on an empty page, hence the nkeys test */
+	/* store the key index */
+	mc->mc_ki[mc->mc_top] = i;
+	if (i >= nkeys)
+		/* There is no entry larger or equal to the key. */
+		return NULL;
+
+	/* nodeptr is fake for LEAF2 */
+	return node;
+}
+/* Compiled out by the #if 0 below. As written it would call func(mc, m2)
+ * for every cursor m2 on mc's DBI whose top page is the same as mc's.
+ */
+#if 0
+static void
+mdb_cursor_adjust(MDB_cursor *mc, func)
+{
+	MDB_cursor *m2;
+
+	for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
+		if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
+			func(mc, m2);
+		}
+	}
+}
+#endif
+
+/** Pop a page off the top of the cursor's stack.
+ * Does nothing when the stack is empty. Popping the last page clears
+ * C_INITIALIZED and leaves mc_top unchanged.
+ */
+static void
+mdb_cursor_pop(MDB_cursor *mc)
+{
+	if (mc->mc_snum) {
+		DPRINTF(("popping page %"Z"u off db %d cursor %p",
+			mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc));
+
+		mc->mc_snum--;
+		if (mc->mc_snum) {
+			mc->mc_top--;
+		} else {
+			mc->mc_flags &= ~C_INITIALIZED;
+		}
+	}
+}
+
+/** Push a page onto the top of the cursor's stack.
+ * Set #MDB_TXN_ERROR on failure.
+ * @param[in,out] mc the cursor to push onto.
+ * @param[in] mp the page that becomes the new top; its key index is set to 0.
+ * @return #MDB_SUCCESS, or #MDB_CURSOR_FULL if the stack already
+ *	holds CURSOR_STACK pages.
+ */
+static int
+mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
+{
+	DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
+		DDBI(mc), (void *) mc));
+
+	if (mc->mc_snum >= CURSOR_STACK) {
+		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+		return MDB_CURSOR_FULL;
+	}
+
+	mc->mc_top = mc->mc_snum++;
+	mc->mc_pg[mc->mc_top] = mp;
+	mc->mc_ki[mc->mc_top] = 0;
+
+	return MDB_SUCCESS;
+}
+
+/** Find the address of the page corresponding to a given page number.
+ * Set #MDB_TXN_ERROR on failure.
+ * @param[in] mc the cursor accessing the page.
+ * @param[in] pgno the page number for the page to retrieve.
+ * @param[out] ret address of a pointer where the page's address will be stored.
+ * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page.
+ * @return 0 on success, non-zero on failure.
+ *
+ * \b lvl may be NULL. Levels above 1 count parent transactions:
+ * the level is incremented once per ancestor txn searched.
+ */
+static int
+mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl)
+{
+	MDB_txn *txn = mc->mc_txn;
+	MDB_env *env = txn->mt_env;
+	MDB_page *p = NULL;
+	int level;
+
+	/* Dirty and spill lists are only searched for txns that are
+	 * neither read-only nor using MDB_TXN_WRITEMAP.
+	 */
+	if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) {
+		MDB_txn *tx2 = txn;
+		level = 1;
+		do {
+			MDB_ID2L dl = tx2->mt_u.dirty_list;
+			unsigned x;
+			/* Spilled pages were dirtied in this txn and flushed
+			 * because the dirty list got full. Bring this page
+			 * back in from the map (but don't unspill it here,
+			 * leave that unless page_touch happens again).
+			 */
+			if (tx2->mt_spill_pgs) {
+				MDB_ID pn = pgno << 1;	/* the spill list is searched for pgno << 1 */
+				x = mdb_midl_search(tx2->mt_spill_pgs, pn);
+				if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
+					p = (MDB_page *)(env->me_map + env->me_psize * pgno);
+					goto done;
+				}
+			}
+			if (dl[0].mid) {	/* dl[0].mid is used as the entry count */
+				unsigned x = mdb_mid2l_search(dl, pgno);	/* shadows the outer x */
+				if (x <= dl[0].mid && dl[x].mid == pgno) {
+					p = dl[x].mptr;
+					goto done;
+				}
+			}
+			level++;
+		} while ((tx2 = tx2->mt_parent) != NULL);
+	}
+
+	if (pgno < txn->mt_next_pgno) {
+		level = 0;	/* not in any dirty or spill list: use the mapped page */
+		p = (MDB_page *)(env->me_map + env->me_psize * pgno);
+	} else {
+		DPRINTF(("page %"Z"u not found", pgno));
+		txn->mt_flags |= MDB_TXN_ERROR;
+		return MDB_PAGE_NOTFOUND;
+	}
+
+done:
+	*ret = p;
+	if (lvl)
+		*lvl = level;
+	return MDB_SUCCESS;
+}
+
+/** Finish #mdb_page_search() / #mdb_page_search_lowest().
+ *	The cursor is at the root page, set up the rest of it.
+ * @param[in,out] mc the cursor; pages are pushed until a leaf is on top.
+ * @param[in] key the key to descend towards; not consulted when
+ *	MDB_PS_FIRST or MDB_PS_LAST is set.
+ * @param[in] flags MDB_PS_FIRST / MDB_PS_LAST pick the first / last
+ *	branch entry at every level; MDB_PS_MODIFY touches each page
+ *	reached with mdb_page_touch().
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
+{
+	MDB_page	*mp = mc->mc_pg[mc->mc_top];
+	int rc;
+	DKBUF;
+
+	while (IS_BRANCH(mp)) {
+		MDB_node	*node;
+		indx_t		i;	/* index of the branch entry to follow */
+
+		DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp)));
+		/* Don't assert on branch pages in the FreeDB. We can get here
+		 * while in the process of rebalancing a FreeDB branch page; we must
+		 * let that proceed. ITS#8336
+		 */
+		mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1);
+		DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
+
+		if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
+			i = 0;
+			if (flags & MDB_PS_LAST) {
+				i = NUMKEYS(mp) - 1;
+				/* if already init'd, see if we're already in right place */
+				if (mc->mc_flags & C_INITIALIZED) {
+					if (mc->mc_ki[mc->mc_top] == i) {
+						/* Reuse the child page already stored in the next stack slot. */
+						mc->mc_top = mc->mc_snum++;
+						mp = mc->mc_pg[mc->mc_top];
+						goto ready;
+					}
+				}
+			}
+		} else {
+			int	 exact;
+			node = mdb_node_search(mc, key, &exact);
+			if (node == NULL)
+				i = NUMKEYS(mp) - 1;	/* key is beyond every entry: follow the last one */
+			else {
+				i = mc->mc_ki[mc->mc_top];
+				if (!exact) {
+					/* Found entry is greater than the key: step back one entry. */
+					mdb_cassert(mc, i > 0);
+					i--;
+				}
+			}
+			DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
+		}
+
+		mdb_cassert(mc, i < NUMKEYS(mp));
+		node = NODEPTR(mp, i);
+
+		if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
+			return rc;
+
+		mc->mc_ki[mc->mc_top] = i;	/* record the entry followed in the parent */
+		if ((rc = mdb_cursor_push(mc, mp)))
+			return rc;
+
+ready:
+		if (flags & MDB_PS_MODIFY) {
+			if ((rc = mdb_page_touch(mc)) != 0)
+				return rc;
+			mp = mc->mc_pg[mc->mc_top];	/* reload: the top page is re-read after the touch */
+		}
+	}
+
+	if (!IS_LEAF(mp)) {
+		DPRINTF(("internal error, index points to a %02X page!?",
+		    mp->mp_flags));
+		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+		return MDB_CORRUPTED;
+	}
+
+	DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
+	    key ? DKEY(key) : "null"));
+	mc->mc_flags |= C_INITIALIZED;
+	mc->mc_flags &= ~C_EOF;
+
+	return MDB_SUCCESS;
+}
+
+/** Search for the lowest key under the current branch page.
+ * This just bypasses a NUMKEYS check in the current page
+ * before calling mdb_page_search_root(), because the callers
+ * are all in situations where the current page is known to
+ * be underfilled.
+ * @param[in,out] mc the cursor; the child under entry 0 of its top
+ *	page is pushed, then the descent continues with MDB_PS_FIRST.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_page_search_lowest(MDB_cursor *mc)
+{
+	MDB_page	*mp = mc->mc_pg[mc->mc_top];
+	MDB_node	*node = NODEPTR(mp, 0);
+	int rc;
+
+	if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
+		return rc;
+
+	mc->mc_ki[mc->mc_top] = 0;
+	if ((rc = mdb_cursor_push(mc, mp)))
+		return rc;
+	return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
+}
+
+/** Search for the page a given key should be in.
+ * Push it and its parent pages on the cursor stack.
+ * @param[in,out] mc the cursor for this operation.
+ * @param[in] key the key to search for, or NULL for first/last page.
+ * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
+ *   are touched (updated with new page numbers).
+ *   If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
+ *   This is used by #mdb_cursor_first() and #mdb_cursor_last().
+ *   If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
+{
+	int		 rc;
+	pgno_t		 root;
+
+	/* Make sure the txn is still viable, then find the root from
+	 * the txn's db table and set it as the root of the cursor's stack.
+	 */
+	if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) {
+		DPUTS("transaction may not be used now");
+		return MDB_BAD_TXN;
+	} else {
+		/* Make sure we're using an up-to-date root */
+		if (*mc->mc_dbflag & DB_STALE) {
+			MDB_cursor mc2;	/* temporary cursor on MAIN_DBI, used to look the DB up by name */
+			if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
+				return MDB_BAD_DBI;
+			mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
+			rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
+			if (rc)
+				return rc;
+			{
+				MDB_val data;
+				int exact = 0;
+				uint16_t flags;	/* shadows the flags parameter inside this block */
+				MDB_node *leaf = mdb_node_search(&mc2,
+					&mc->mc_dbx->md_name, &exact);
+				if (!exact)
+					return MDB_NOTFOUND;
+				if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
+					return MDB_INCOMPATIBLE; /* not a named DB */
+				rc = mdb_node_read(&mc2, leaf, &data);
+				if (rc)
+					return rc;
+				/* memcpy instead of a direct field read; presumably
+				 * because the node data may be unaligned - confirm.
+				 */
+				memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)),
+					sizeof(uint16_t));
+				/* The txn may not know this DBI, or another process may
+				 * have dropped and recreated the DB with other flags.
+				 */
+				if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
+					return MDB_INCOMPATIBLE;
+				memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
+			}
+			*mc->mc_dbflag &= ~DB_STALE;
+		}
+		root = mc->mc_db->md_root;
+
+		if (root == P_INVALID) {		/* Tree is empty. */
+			DPUTS("tree is empty");
+			return MDB_NOTFOUND;
+		}
+	}
+
+	mdb_cassert(mc, root > 1);
+	/* Fetch the root page unless stack slot 0 already holds it. */
+	if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
+		if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)
+			return rc;
+
+	mc->mc_snum = 1;
+	mc->mc_top = 0;
+
+	DPRINTF(("db %d root page %"Z"u has flags 0x%X",
+		DDBI(mc), root, mc->mc_pg[0]->mp_flags));
+
+	if (flags & MDB_PS_MODIFY) {
+		if ((rc = mdb_page_touch(mc)))
+			return rc;
+	}
+
+	if (flags & MDB_PS_ROOTONLY)
+		return MDB_SUCCESS;
+
+	return mdb_page_search_root(mc, key, flags);
+}
+/** Free the run of overflow pages that starts at \b mp.
+ * The run is mp->mp_pages page numbers beginning at mp->mp_pgno. It is
+ * either inserted into env->me_pghead or appended to txn->mt_free_pgs,
+ * and md_overflow_pages of the cursor's DB is reduced by its length.
+ * @param[in] mc the cursor for this operation.
+ * @param[in] mp the first overflow page of the run.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
+{
+	MDB_txn *txn = mc->mc_txn;
+	pgno_t pg = mp->mp_pgno;
+	unsigned x = 0, ovpages = mp->mp_pages;
+	MDB_env *env = txn->mt_env;
+	MDB_IDL sl = txn->mt_spill_pgs;
+	MDB_ID pn = pg << 1;	/* the spill list is searched for pg << 1 */
+	int rc;
+
+	DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages));
+	/* If the page is dirty or on the spill list we just acquired it,
+	 * so we should give it back to our current free list, if any.
+	 * Otherwise put it onto the list of pages we freed in this txn.
+	 *
+	 * Won't create me_pghead: me_pglast must be inited along with it.
+	 * Unsupported in nested txns: They would need to hide the page
+	 * range in ancestor txns' dirty and spilled lists.
+	 */
+	if (env->me_pghead &&
+		!txn->mt_parent &&
+		((mp->mp_flags & P_DIRTY) ||
+		 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn)))
+	{
+		unsigned i, j;
+		pgno_t *mop;
+		MDB_ID2 *dl, ix, iy;
+		rc = mdb_midl_need(&env->me_pghead, ovpages);
+		if (rc)
+			return rc;
+		if (!(mp->mp_flags & P_DIRTY)) {
+			/* This page is no longer spilled */
+			if (x == sl[0])
+				sl[0]--;	/* last entry: just shorten the list */
+			else
+				sl[x] |= 1;	/* otherwise set the low bit of the entry */
+			goto release;
+		}
+		/* Remove from dirty list */
+		dl = txn->mt_u.dirty_list;
+		x = dl[0].mid--;
+		/* Walk from the last entry towards the front, moving each
+		 * entry down one slot until the entry for mp is reached.
+		 */
+		for (ix = dl[x]; ix.mptr != mp; ix = iy) {
+			if (x > 1) {
+				x--;
+				iy = dl[x];
+				dl[x] = ix;
+			} else {
+				/* mp is not in the dirty list: flag the txn and bail out. */
+				mdb_cassert(mc, x > 1);
+				j = ++(dl[0].mid);
+				dl[j] = ix;		/* Unsorted. OK when MDB_TXN_ERROR. */
+				txn->mt_flags |= MDB_TXN_ERROR;
+				return MDB_CORRUPTED;
+			}
+		}
+		txn->mt_dirty_room++;
+		if (!(env->me_flags & MDB_WRITEMAP))
+			mdb_dpage_free(env, mp);
+release:
+		/* Insert in me_pghead */
+		mop = env->me_pghead;
+		j = mop[0] + ovpages;
+		/* Entries smaller than pg are moved up by ovpages slots, then
+		 * the freed page numbers are written into the gap.
+		 */
+		for (i = mop[0]; i && mop[i] < pg; i--)
+			mop[j--] = mop[i];
+		while (j>i)
+			mop[j--] = pg++;
+		mop[0] += ovpages;
+	} else {
+		rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages);
+		if (rc)
+			return rc;
+	}
+	mc->mc_db->md_overflow_pages -= ovpages;
+	return 0;
+}
+
+/** Return the data associated with a given node.
+ * @param[in] mc The cursor for this operation.
+ * @param[in] leaf The node being read.
+ * @param[out] data Updated to point to the node's data.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data)
+{
+	MDB_page	*omp;		/* overflow page */
+	pgno_t		 pgno;
+	int rc;
+
+	if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
+		/* Data is stored in the node itself. */
+		data->mv_size = NODEDSZ(leaf);
+		data->mv_data = NODEDATA(leaf);
+		return MDB_SUCCESS;
+	}
+
+	/* Read overflow data.
+	 * The node's data area holds the overflow page number, which is
+	 * copied out with memcpy.
+	 */
+	data->mv_size = NODEDSZ(leaf);
+	memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
+	if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) {
+		DPRINTF(("read overflow page %"Z"u failed", pgno));
+		return rc;
+	}
+	data->mv_data = METADATA(omp);
+
+	return MDB_SUCCESS;
+}
+/** Look up \b key in database \b dbi.
+ * Positions a temporary cursor with mdb_cursor_set() using #MDB_SET.
+ * @param[in] txn the transaction to read in.
+ * @param[in] dbi the database handle.
+ * @param[in] key the key to look up; must not be NULL.
+ * @param[out] data passed to mdb_cursor_set(); must not be NULL.
+ * @return EINVAL for a NULL key/data or an invalid dbi, #MDB_BAD_TXN if
+ *	the txn is blocked, else the result of mdb_cursor_set().
+ */
+int
+mdb_get(MDB_txn *txn, MDB_dbi dbi,
+    MDB_val *key, MDB_val *data)
+{
+	MDB_cursor	mc;
+	MDB_xcursor	mx;
+	int exact = 0;
+	DKBUF;
+
+	DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key)));
+
+	if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
+		return EINVAL;
+
+	if (txn->mt_flags & MDB_TXN_BLOCKED)
+		return MDB_BAD_TXN;
+
+	mdb_cursor_init(&mc, txn, dbi, &mx);
+	return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
+}
+
+/** Find a sibling for a page.
+ * Replaces the page at the top of the cursor's stack with the
+ * specified sibling, if one exists.
+ * @param[in] mc The cursor for this operation.
+ * @param[in] move_right Non-zero if the right sibling is requested,
+ * otherwise the left sibling.
+ * @return 0 on success, non-zero on failure.
+ *
+ * The function pops to the parent and moves its key index one step. If
+ * the parent has no further entry in that direction it recurses to move
+ * the parent to its own sibling first. On #MDB_NOTFOUND from the
+ * recursion the pop is undone, so the cursor stack is as it was on entry.
+ */
+static int
+mdb_cursor_sibling(MDB_cursor *mc, int move_right)
+{
+	int		 rc;
+	MDB_node	*indx;
+	MDB_page	*mp;
+
+	if (mc->mc_snum < 2) {
+		return MDB_NOTFOUND;		/* root has no siblings */
+	}
+
+	mdb_cursor_pop(mc);
+	DPRINTF(("parent page is page %"Z"u, index %u",
+		mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]));
+
+	/* Is the parent already at its last (right) / first (left) entry? */
+	if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
+		       : (mc->mc_ki[mc->mc_top] == 0)) {
+		DPRINTF(("no more keys left, moving to %s sibling",
+		    move_right ? "right" : "left"));
+		if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) {
+			/* undo cursor_pop before returning */
+			mc->mc_top++;
+			mc->mc_snum++;
+			return rc;
+		}
+	} else {
+		if (move_right)
+			mc->mc_ki[mc->mc_top]++;
+		else
+			mc->mc_ki[mc->mc_top]--;
+		DPRINTF(("just moving to %s index key %u",
+		    move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
+	}
+	mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));
+
+	indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+	if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) {
+		/* mc will be inconsistent if caller does mc_snum++ as above */
+		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
+		return rc;
+	}
+
+	mdb_cursor_push(mc, mp);	/* return value not checked; a slot was freed by the pop above */
+	if (!move_right)
+		mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;	/* moving left: start at the sibling's last entry */
+
+	return MDB_SUCCESS;
+}
+
+/** Move the cursor to the next data item.
*/ +static int +mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) +{ + MDB_page *mp; + MDB_node *leaf; + int rc; + + if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) + return MDB_NOTFOUND; + + if (!(mc->mc_flags & C_INITIALIZED)) + return mdb_cursor_first(mc, key, data); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_NEXT || op == MDB_NEXT_DUP) { + rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + } + } + + DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + if (mc->mc_flags & C_DEL) { + mc->mc_flags ^= C_DEL; + goto skip; + } + + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { + DPUTS("=====> move to next sibling page"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]++; + +skip: + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + 
if (rc != MDB_SUCCESS) + return rc; + } else if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the previous data item. */ +static int +mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) +{ + MDB_page *mp; + MDB_node *leaf; + int rc; + + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + return rc; + mc->mc_ki[mc->mc_top]++; + } + + mp = mc->mc_pg[mc->mc_top]; + + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { + rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) { + MDB_GET_KEY(leaf, key); + mc->mc_flags &= ~C_EOF; + } + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_PREV_DUP) + return MDB_NOTFOUND; + } + } + + DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + + mc->mc_flags &= ~(C_EOF|C_DEL); + + if (mc->mc_ki[mc->mc_top] == 0) { + DPUTS("=====> move to prev sibling page"); + if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]--; + + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (!IS_LEAF(mp)) + return MDB_CORRUPTED; + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if 
(F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } else if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Set the cursor on a specific data item. */ +static int +mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op, int *exactp) +{ + int rc; + MDB_page *mp; + MDB_node *leaf = NULL; + DKBUF; + + if (key->mv_size == 0) + return MDB_BAD_VALSIZE; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + /* See if we're already on the right page */ + if (mc->mc_flags & C_INITIALIZED) { + MDB_val nodekey; + + mp = mc->mc_pg[mc->mc_top]; + if (!NUMKEYS(mp)) { + mc->mc_ki[mc->mc_top] = 0; + return MDB_NOTFOUND; + } + if (MP_FLAGS(mp) & P_LEAF2) { + nodekey.mv_size = mc->mc_db->md_pad; + nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, 0); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* Probably happens rarely, but first node on the page + * was the one we wanted. 
+ */ + mc->mc_ki[mc->mc_top] = 0; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc > 0) { /* key sorts after the first key on this page */ + unsigned int i; + unsigned int nkeys = NUMKEYS(mp); + if (nkeys > 1) { + if (MP_FLAGS(mp) & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + nkeys-1, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, nkeys-1); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* last node was the one we wanted */ + mc->mc_ki[mc->mc_top] = nkeys-1; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc < 0) { + if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + /* This is definitely the right page, skip search_page */ + if (MP_FLAGS(mp) & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + mc->mc_ki[mc->mc_top], nodekey.mv_size); + } else { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* current node was the one we wanted */ + if (exactp) + *exactp = 1; + goto set1; + } + } + rc = 0; + mc->mc_flags &= ~C_EOF; + goto set2; + } + } + /* If any parents have right-sibs, search. + * Otherwise, there's nothing further. + */ + for (i=0; i<mc->mc_top; i++) /* scan the stack levels below mc_top, i.e. the ancestors of the current page */ + if (mc->mc_ki[i] < + NUMKEYS(mc->mc_pg[i])-1) + break; + if (i == mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = nkeys; + return MDB_NOTFOUND; + } + } + if (!mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = 0; + if (op == MDB_SET_RANGE && !exactp) { + rc = 0; + goto set1; + } else + return MDB_NOTFOUND; + } + } else { + mc->mc_pg[0] = 0; + } + + rc = mdb_page_search(mc, key, 0); + if (rc != MDB_SUCCESS) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + +set2: + leaf = mdb_node_search(mc, key, exactp); + if (exactp != NULL && !*exactp) { + /* MDB_SET specified and not an exact match.
*/ + return MDB_NOTFOUND; + } + + if (leaf == NULL) { + DPUTS("===> inexact leaf not found, goto sibling"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; /* no entries matched */ + } + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, 0); + } + +set1: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + if (IS_LEAF2(mp)) { + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; + } else { + ex2p = NULL; + } + rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); + if (rc != MDB_SUCCESS) + return rc; + } + } else if (data) { + if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + MDB_val olddata; + MDB_cmp_func *dcmp; + if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) + return rc; + dcmp = mc->mc_dbx->md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + rc = dcmp(data, &olddata); + if (rc) { + if (op == MDB_GET_BOTH || rc > 0) + return MDB_NOTFOUND; + rc = 0; + } + *data = olddata; + + } else { + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + /* The key already matches in all other cases */ + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) + MDB_GET_KEY(leaf, key); + DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); + + return rc; +} + +/** Move the cursor to the first item in the database. 
*/ +static int +mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) /* mark the sub-cursor as not positioned */ + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + /* search unless the cursor is already initialized with mc_top == 0 */ if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + mc->mc_ki[mc->mc_top] = 0; /* index 0 is the first key on the page */ + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { /* LEAF2 page: keys are md_pad bytes each; only the key is returned */ + if ( key ) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + } + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { /* data comes from the first item of the sub-cursor */ + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the last item in the database.
*/ +static int +mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) /* mark the sub-cursor as not positioned */ + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + /* search unless the cursor is already initialized with mc_top == 0 */ if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; /* index of the last key on the page */ + mc->mc_flags |= C_INITIALIZED|C_EOF; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { /* LEAF2 page: keys are md_pad bytes each; only the key is returned */ + if (key) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + } + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { /* data comes from the last item of the sub-cursor */ + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} +/** Dispatch a cursor retrieval or positioning request according to op. Returns EINVAL for a NULL cursor and MDB_BAD_TXN when the txn has MDB_TXN_BLOCKED set. */ +int +mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) +{ + int rc; + int exact = 0; + int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); + + if (mc == NULL) + return EINVAL; + + if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + switch (op) { + case MDB_GET_CURRENT: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + } else { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + rc = MDB_NOTFOUND; + break; + } + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } else { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc =
mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); + } else { + rc = mdb_node_read(mc, leaf, data); + } + } + } + } + break; + case MDB_GET_BOTH: + case MDB_GET_BOTH_RANGE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + /* FALLTHRU */ + case MDB_SET: + case MDB_SET_KEY: + case MDB_SET_RANGE: + if (key == NULL) { + rc = EINVAL; + } else { + rc = mdb_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? NULL : &exact); + } + break; + case MDB_GET_MULTIPLE: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = MDB_SUCCESS; + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || + (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + break; + goto fetchm; + case MDB_NEXT_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (rc == MDB_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDB_cursor *mx; +fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * + mx->mc_db->md_pad; + data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_PREV_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_last(mc, key, data); + else + rc = MDB_SUCCESS; + if (rc == MDB_SUCCESS) { + MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (mx->mc_flags & C_INITIALIZED) { + rc = mdb_cursor_sibling(mx, 0); + if (rc == MDB_SUCCESS) + goto fetchm; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_NEXT: + 
case MDB_NEXT_DUP: + case MDB_NEXT_NODUP: + rc = mdb_cursor_next(mc, key, data, op); + break; + case MDB_PREV: + case MDB_PREV_DUP: + case MDB_PREV_NODUP: + rc = mdb_cursor_prev(mc, key, data, op); + break; + case MDB_FIRST: + rc = mdb_cursor_first(mc, key, data); + break; + case MDB_FIRST_DUP: + mfunc = mdb_cursor_first; + mmove: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + rc = MDB_NOTFOUND; + break; + } + mc->mc_flags &= ~C_EOF; + { + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_GET_KEY(leaf, key); + rc = mdb_node_read(mc, leaf, data); + break; + } + } + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDB_LAST: + rc = mdb_cursor_last(mc, key, data); + break; + case MDB_LAST_DUP: + mfunc = mdb_cursor_last; + goto mmove; + default: + DPRINTF(("unhandled/unimplemented cursor operation %u", op)); + rc = EINVAL; + break; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + return rc; +} + +/** Touch all the pages in the cursor stack. Set mc_top. + * Makes sure all the pages are writable, before attempting a write operation. + * @param[in] mc The cursor to operate on. 
+ */ +static int +mdb_cursor_touch(MDB_cursor *mc) +{ + int rc = MDB_SUCCESS; + + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { + /* Touch DB record of named DB */ + MDB_cursor mc2; + MDB_xcursor mcx; + if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) + return MDB_BAD_DBI; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); /* temporary cursor on MAIN_DBI, searched by this DB's name */ + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); + if (rc) + return rc; + *mc->mc_dbflag |= DB_DIRTY; + } + mc->mc_top = 0; /* touch each stack level in order, starting at level 0 */ + if (mc->mc_snum) { + do { + rc = mdb_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum-1; /* point mc_top back at the last stack level, also after an error */ + } + return rc; +} + +/** Do not spill pages to disk if txn is getting full, may fail instead */ +#define MDB_NOSPILL 0x8000 + +int +mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, + unsigned int flags) +{ + MDB_env *env; + MDB_node *leaf = NULL; + MDB_page *fp, *mp, *sub_root = NULL; + uint16_t fp_flags; + MDB_val xdata, *rdata, dkey, olddata; + MDB_db dummy; + int do_sub = 0, insert_key, insert_data; + unsigned int mcount = 0, dcount = 0, nospill; + size_t nsize; + int rc, rc2; + unsigned int nflags; + DKBUF; + + if (mc == NULL || key == NULL) + return EINVAL; + + env = mc->mc_txn->mt_env; + + /* Check this first so counter will always be zero on any + * early failures. + */ + if (flags & MDB_MULTIPLE) { + dcount = data[1].mv_size; + data[1].mv_size = 0; + if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) + return MDB_INCOMPATIBLE; + } + + nospill = flags & MDB_NOSPILL; + flags &= ~MDB_NOSPILL; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (key->mv_size-1 >= ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; + +#if SIZE_MAX > MAXDATASIZE + if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ?
ENV_MAXKEY(env) : MAXDATASIZE)) + return MDB_BAD_VALSIZE; +#else + if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; +#endif + + DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", + DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); + + dkey.mv_size = 0; + + if (flags & MDB_CURRENT) { + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + rc = MDB_SUCCESS; + } else if (mc->mc_db->md_root == P_INVALID) { + /* new database, cursor has nothing to point to */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + rc = MDB_NO_ROOT; + } else { + int exact = 0; + MDB_val d2; + if (flags & MDB_APPEND) { + MDB_val k2; + rc = mdb_cursor_last(mc, &k2, &d2); + if (rc == 0) { + rc = mc->mc_dbx->md_cmp(key, &k2); + if (rc > 0) { + rc = MDB_NOTFOUND; + mc->mc_ki[mc->mc_top]++; + } else { + /* new key is <= last key */ + rc = MDB_KEYEXIST; + } + } + } else { + rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); + } + if ((flags & MDB_NOOVERWRITE) && rc == 0) { + DPRINTF(("duplicate key [%s]", DKEY(key))); + *data = d2; + return MDB_KEYEXIST; + } + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + /* Cursor is positioned, check for room in the dirty list */ + if (!nospill) { + if (flags & MDB_MULTIPLE) { + rdata = &xdata; + xdata.mv_size = data->mv_size * dcount; + } else { + rdata = data; + } + if ((rc2 = mdb_page_spill(mc, key, rdata))) + return rc2; + } + + if (rc == MDB_NO_ROOT) { + MDB_page *np; + /* new database, write a root leaf page */ + DPUTS("allocating new root leaf page"); + if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { + return rc2; + } + mdb_cursor_push(mc, np); + mc->mc_db->md_root = np->mp_pgno; + mc->mc_db->md_depth++; + *mc->mc_dbflag |= DB_DIRTY; + if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) + == MDB_DUPFIXED) + MP_FLAGS(np) |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; + } else { + /* make sure all 
cursor pages are writable */ + rc2 = mdb_cursor_touch(mc); + if (rc2) + return rc2; + } + + insert_key = insert_data = rc; + if (insert_key) { + /* The key does not exist */ + DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + LEAFSIZE(key, data) > env->me_nodemax) + { + /* Too big for a node, insert in sub-DB. Set up an empty + * "old sub-page" for prep_subDB to expand to a full page. + */ + fp_flags = P_LEAF|P_DIRTY; + fp = env->me_pbuf; + fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ + MP_LOWER(fp) = MP_UPPER(fp) = (PAGEHDRSZ-PAGEBASE); + olddata.mv_size = PAGEHDRSZ; + goto prep_subDB; + } + } else { + /* there's only a key anyway, so this is a no-op */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + char *ptr; + unsigned int ksize = mc->mc_db->md_pad; + if (key->mv_size != ksize) + return MDB_BAD_VALSIZE; + ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); +fix_parent: + /* if overwriting slot 0 of leaf, need to + * update branch key if there is a parent page + */ + if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + unsigned short dtop = 1; + mc->mc_top--; + /* slot 0 is always an empty key, find real slot */ + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + mc->mc_top--; + dtop++; + } + if (mc->mc_ki[mc->mc_top]) + rc2 = mdb_update_key(mc, key); + else + rc2 = MDB_SUCCESS; + mc->mc_top += dtop; + if (rc2) + return rc2; + } + return MDB_SUCCESS; + } + +more: + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + olddata.mv_size = NODEDSZ(leaf); + olddata.mv_data = NODEDATA(leaf); + + /* DB has dups? */ + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + /* Prepare (sub-)page/sub-DB to accept the new item, + * if needed. fp: old sub-page or a header faking + * it. mp: new (sub-)page. offset: growth in page + * size. xdata: node data with new page or DB. 
+ */ + unsigned i, offset = 0; + mp = fp = xdata.mv_data = env->me_pbuf; + mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + + /* Was a single item before, must convert now */ + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_cmp_func *dcmp; + /* Just overwrite the current item */ + if (flags == MDB_CURRENT) + goto current; + dcmp = mc->mc_dbx->md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + /* does data match? */ + if (!dcmp(data, &olddata)) { + if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) + return MDB_KEYEXIST; + /* overwrite it */ + goto current; + } + + /* Back up original data item */ + dkey.mv_size = olddata.mv_size; + dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); + + /* Make sub-page header for the dup items, with dummy body */ + MP_FLAGS(fp) = P_LEAF|P_DIRTY|P_SUBP; + MP_LOWER(fp) = (PAGEHDRSZ-PAGEBASE); + xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + MP_FLAGS(fp) |= P_LEAF2; + fp->mp_pad = data->mv_size; + xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ + } else { + xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.mv_size & 1) + (data->mv_size & 1); + } + MP_UPPER(fp) = xdata.mv_size - PAGEBASE; + olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ + } else if (leaf->mn_flags & F_SUBDATA) { + /* Data is on sub-DB, just store it */ + flags |= F_DUPDATA|F_SUBDATA; + goto put_sub; + } else { + /* Data is on sub-page */ + fp = olddata.mv_data; + switch (flags) { + default: + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + offset = EVEN(NODESIZE + sizeof(indx_t) + + data->mv_size); + break; + } + offset = fp->mp_pad; + if (SIZELEFT(fp) < offset) { + offset *= 4; /* space for 4 more */ + break; + } + /* FALLTHRU */ /* Big enough MDB_DUPFIXED sub-page */ + case MDB_CURRENT: + MP_FLAGS(fp) |= P_DIRTY; + COPY_PGNO(MP_PGNO(fp), MP_PGNO(mp)); + mc->mc_xcursor->mx_cursor.mc_pg[0] 
= fp; + flags |= F_DUPDATA; + goto put_sub; + } + xdata.mv_size = olddata.mv_size + offset; + } + + fp_flags = MP_FLAGS(fp); + if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { + /* Too big for a sub-page, convert to sub-DB */ + fp_flags &= ~P_SUBP; +prep_subDB: + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp_flags |= P_LEAF2; + dummy.md_pad = fp->mp_pad; + dummy.md_flags = MDB_DUPFIXED; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + dummy.md_flags |= MDB_INTEGERKEY; + } else { + dummy.md_pad = 0; + dummy.md_flags = 0; + } + dummy.md_depth = 1; + dummy.md_branch_pages = 0; + dummy.md_leaf_pages = 1; + dummy.md_overflow_pages = 0; + dummy.md_entries = NUMKEYS(fp); + xdata.mv_size = sizeof(MDB_db); + xdata.mv_data = &dummy; + if ((rc = mdb_page_alloc(mc, 1, &mp))) + return rc; + offset = env->me_psize - olddata.mv_size; + flags |= F_DUPDATA|F_SUBDATA; + dummy.md_root = mp->mp_pgno; + sub_root = mp; + } + if (mp != fp) { /* copy the old sub-page contents from fp into mp */ + MP_FLAGS(mp) = fp_flags | P_DIRTY; + MP_PAD(mp) = MP_PAD(fp); + MP_LOWER(mp) = MP_LOWER(fp); + MP_UPPER(mp) = MP_UPPER(fp) + offset; + if (fp_flags & P_LEAF2) { + memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); + } else { + memcpy((char *)mp + MP_UPPER(mp) + PAGEBASE, (char *)fp + MP_UPPER(fp) + PAGEBASE, + olddata.mv_size - MP_UPPER(fp) - PAGEBASE); + memcpy((char *)MP_PTRS(mp), (char *)MP_PTRS(fp), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); + /* the upper region was copied offset bytes higher, so shift every copied slot pointer by the same amount */ for (i=0; i<NUMKEYS(fp); i++) + mp->mp_ptrs[i] += offset; + } + } + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = 1; + if (!insert_key) + mdb_node_del(mc, 0); + goto new_sub; + } +current: + /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ + if ((leaf->mn_flags ^ flags) & F_SUBDATA) + return MDB_INCOMPATIBLE; + /* overflow page overwrites need special handling */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + + memcpy(&pg, olddata.mv_data, sizeof(pg)); + if ((rc2 = mdb_page_get(mc, pg, &omp,
&level)) != 0) + return rc2; + ovpages = omp->mp_pages; + + /* Is the ov page large enough? */ + if (ovpages >= dpages) { + if (!(omp->mp_flags & P_DIRTY) && + (level || (env->me_flags & MDB_WRITEMAP))) + { + rc = mdb_page_unspill(mc->mc_txn, omp, &omp); + if (rc) + return rc; + level = 0; /* dirty in this txn or clean */ + } + /* Is it dirty? */ + if (omp->mp_flags & P_DIRTY) { + /* yes, overwrite it. Note in this case we don't + * bother to try shrinking the page if the new data + * is smaller than the overflow threshold. + */ + if (level > 1) { + /* It is writable only in a parent txn */ + size_t sz = (size_t) env->me_psize * ovpages, off; + MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); + MDB_ID2 id2; + if (!np) + return ENOMEM; + id2.mid = pg; + id2.mptr = np; + /* Note - this page is already counted in parent's dirty_room */ + rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + mdb_cassert(mc, rc2 == 0); + /* Currently we make the page look as with put() in the + * parent txn, in case the user peeks at MDB_RESERVEd + * or unused parts. Some users treat ovpages specially. + */ + if (!(flags & MDB_RESERVE)) { + /* Skip the part where LMDB will put *data. + * Copy end of page, adjusting alignment so + * compiler may copy words instead of bytes. + */ + off = (PAGEHDRSZ + data->mv_size) & -(int)sizeof(size_t); + memcpy((size_t *)((char *)np + off), + (size_t *)((char *)omp + off), sz - off); + sz = PAGEHDRSZ; + } + memcpy(np, omp, sz); /* Copy beginning of page */ + omp = np; + } + SETDSZ(leaf, data->mv_size); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(omp); + else + memcpy(METADATA(omp), data->mv_data, data->mv_size); + return MDB_SUCCESS; + } + } + if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; + } else if (data->mv_size == olddata.mv_size) { + /* same size, just replace it. Note that we could + * also reuse this node if the new data is smaller, + * but instead we opt to shrink the node in that case. 
+ */ + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = olddata.mv_data; + else if (!(mc->mc_flags & C_SUB)) + memcpy(olddata.mv_data, data->mv_data, data->mv_size); + else { + if (key->mv_size != NODEKSZ(leaf)) + goto new_ksize; + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); + goto fix_parent; + } + return MDB_SUCCESS; + } +new_ksize: + mdb_node_del(mc, 0); + } + + rdata = data; + +new_sub: + nflags = flags & NODE_ADD_FLAGS; + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); + if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { + if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) + nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ + if (!insert_key) + nflags |= MDB_SPLIT_REPLACE; + rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); + } else { + /* There is room already in this leaf page. */ + rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); + if (rc == 0) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; + if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { + m3->mc_ki[i]++; + } + XCURSOR_REFRESH(m3, i, mp); + } + } + } + + if (rc == MDB_SUCCESS) { + /* Now store the actual data in the child DB. Note that we're + * storing the user data in the keys field, so there are strict + * size limits on dupdata. The actual data fields of the child + * DB are all zero size. 
+ */ + if (do_sub) { + int xflags, new_dupdata; + size_t ecount; +put_sub: + xdata.mv_size = 0; + xdata.mv_data = ""; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if ((flags & (MDB_CURRENT|MDB_APPENDDUP)) == MDB_CURRENT) { + xflags = MDB_CURRENT|MDB_NOSPILL; + } else { + mdb_xcursor_init1(mc, leaf); + xflags = (flags & MDB_NODUPDATA) ? + MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; + } + if (sub_root) + mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; + new_dupdata = (int)dkey.mv_size; + /* converted, write the original data first */ + if (dkey.mv_size) { + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + if (rc) + goto bad_sub; + /* we've done our job */ + dkey.mv_size = 0; + } + if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2; + MDB_xcursor *mx = mc->mc_xcursor; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[i] == mp) { + if (m2->mc_ki[i] == mc->mc_ki[i]) { + mdb_xcursor_init2(m2, mx, new_dupdata); + } else if (!insert_key) { + XCURSOR_REFRESH(m2, i, mp); + } + } + } + } + ecount = mc->mc_xcursor->mx_db.md_entries; + if (flags & MDB_APPENDDUP) + xflags |= MDB_APPEND; + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + if (flags & F_SUBDATA) { + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } + insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; + } + /* Increment count unless we just replaced an existing item. */ + if (insert_data) + mc->mc_db->md_entries++; + if (insert_key) { + /* Invalidate txn if we created an empty sub-DB */ + if (rc) + goto bad_sub; + /* If we succeeded and the key didn't exist before, + * make sure the cursor is marked valid. 
+ */ + mc->mc_flags |= C_INITIALIZED; + } + if (flags & MDB_MULTIPLE) { + if (!rc) { + mcount++; + /* let caller know how many succeeded, if any */ + data[1].mv_size = mcount; + if (mcount < dcount) { + data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + insert_key = insert_data = 0; + goto more; + } + } + } + return rc; +bad_sub: + if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ + rc = MDB_CORRUPTED; + } + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_cursor_del(MDB_cursor *mc, unsigned int flags) /* Delete the item at the cursor position; with MDB_NODUPDATA on an F_DUPDATA node the key and all of its duplicates are removed. Returns 0 or an error code. */ +{ + MDB_node *leaf; + MDB_page *mp; + int rc; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; /* read-only txn gives EACCES, a blocked txn gives MDB_BAD_TXN */ + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; /* cursor index is past the last key on its page */ + + if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) + return rc; + + rc = mdb_cursor_touch(mc); + if (rc) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + if (!IS_LEAF(mp)) + return MDB_CORRUPTED; + if (IS_LEAF2(mp)) + goto del_key; /* LEAF2 pages skip all per-node flag handling below */ + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (flags & MDB_NODUPDATA) { + /* mdb_cursor_del0() will subtract the final entry */ + mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } else { + if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); /* recurse on the sub-cursor to delete only the current duplicate */ + if (rc) + return rc; + /* If sub-DB still has entries, we're done */ + if (mc->mc_xcursor->mx_db.md_entries) { + if (leaf->mn_flags & F_SUBDATA) { + /* update subDB info */ + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } else { + MDB_cursor *m2; + /* shrink fake page */ + mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mp, 
mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + /* fix other sub-DB cursors pointed at fake pages on this page */ + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + XCURSOR_REFRESH(m2, mc->mc_top, mp); + } + } + } + mc->mc_db->md_entries--; + return rc; + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } + /* otherwise fall thru and delete the sub-DB */ + } + + if (leaf->mn_flags & F_SUBDATA) { + /* add all the child DB's pages to the free list */ + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + goto fail; + } + } + /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ + else if ((leaf->mn_flags ^ flags) & F_SUBDATA) { + rc = MDB_INCOMPATIBLE; + goto fail; + } + + /* add overflow pages to free list */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + + memcpy(&pg, NODEDATA(leaf), sizeof(pg)); /* for F_BIGDATA nodes the node data is the overflow page number */ + if ((rc = mdb_page_get(mc, pg, &omp, NULL)) || + (rc = mdb_ovpage_free(mc, omp))) + goto fail; + } + +del_key: + return mdb_cursor_del0(mc); + +fail: + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; /* failures reaching this label mark the whole txn as errored */ + return rc; +} + +/** Allocate and initialize new pages for a database. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc a cursor on the database being added to. + * @param[in] flags flags defining what type of page is being allocated. + * @param[in] num the number of pages to allocate. This is usually 1, + * unless allocating overflow pages for a large record. + * @param[out] mp Address of a page, or NULL on failure. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) +{ + MDB_page *fresh; + int err = mdb_page_alloc(mc, num, &fresh); + + if (err) + return err; + DPRINTF(("allocated new mpage %"Z"u, page size %u", + fresh->mp_pgno, mc->mc_txn->mt_env->me_psize)); + /* Start with an empty page: lower bound just past the header, upper bound at the page end. */ + fresh->mp_flags = flags | P_DIRTY; + fresh->mp_lower = (PAGEHDRSZ-PAGEBASE); + fresh->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; + + /* Account for the new page(s) in the per-database counters. */ + if (IS_BRANCH(fresh)) { + mc->mc_db->md_branch_pages++; + } else if (IS_LEAF(fresh)) { + mc->mc_db->md_leaf_pages++; + } else if (IS_OVERFLOW(fresh)) { + mc->mc_db->md_overflow_pages += num; + fresh->mp_pages = num; + } + *mp = fresh; + + return 0; +} + +/** Compute the on-page footprint of a leaf node. + * When #LEAFSIZE(key, data) exceeds the environment's me_nodemax, only a + * page number (sizeof(pgno_t)) is counted in place of the data, matching + * a node whose data is stored on an overflow page. The result includes + * one #indx_t slot and is rounded with #EVEN, to guarantee 2-byte + * alignment of the #MDB_node headers. + * @param[in] env The environment handle (supplies me_nodemax). + * @param[in] key The key for the node. + * @param[in] data The data for the node. + * @return The number of bytes needed to store the node. + */ +static size_t +mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) +{ + size_t need = LEAFSIZE(key, data); + + /* Oversized node: swap the data length for the size of a page number. */ + if (need > env->me_nodemax) + need -= data->mv_size - sizeof(pgno_t); + + return EVEN(need + sizeof(indx_t)); +} + +/** Calculate the size of a branch node. + * The size should depend on the environment's page size but since + * we currently don't support spilling large keys onto overflow + * pages, it's simply the size of the #MDB_node header plus the + * size of the key. Sizes are always rounded up to an even number + * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @return The number of bytes needed to store the node. 
+ */ +static size_t +mdb_branch_size(MDB_env *env, MDB_val *key) +{ + size_t need = INDXSIZE(key); + + if (need > env->me_nodemax) { + /* Spilling large keys to an overflow page is not implemented, + * so this branch is intentionally empty. */ + /* sz -= key->size - sizeof(pgno_t); */ + } + + /* NOTE(review): unlike mdb_leaf_size(), no EVEN() rounding is applied here. */ + return need + sizeof(indx_t); +} + +/** Add a node to the page pointed to by the cursor. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc The cursor for this operation. + * @param[in] indx The index on the page where the new node should be added. + * @param[in] key The key for the new node. + * @param[in] data The data for the new node, if any. + * @param[in] pgno The page number, if adding a branch node. + * @param[in] flags Flags for the node. + * @return 0 on success, non-zero on failure. Possible errors are: + *
<ul> + *
<li>ENOMEM - failed to allocate overflow pages for the node. + *
<li>MDB_PAGE_FULL - there is insufficient room in the page. This error + * should never happen since all callers already calculate the + * page's free space before calling this function. + * </ul>
+ */ +static int +mdb_node_add(MDB_cursor *mc, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags) +{ + unsigned int i; + size_t node_size = NODESIZE; + ssize_t room; + indx_t ofs; + MDB_node *node; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_page *ofp = NULL; /* overflow page */ + void *ndata; + DKBUF; + + mdb_cassert(mc, MP_UPPER(mp) >= MP_LOWER(mp)); + + DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", + IS_LEAF(mp) ? "leaf" : "branch", + IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, + key ? key->mv_size : 0, key ? DKEY(key) : "null")); + + if (IS_LEAF2(mp)) { + /* Move higher keys up one slot. */ + int ksize = mc->mc_db->md_pad, dif; + char *ptr = LEAF2KEY(mp, indx, ksize); + dif = NUMKEYS(mp) - indx; + if (dif > 0) + memmove(ptr+ksize, ptr, dif*ksize); + /* insert new key */ + memcpy(ptr, key->mv_data, ksize); + + /* Just using these for counting */ + MP_LOWER(mp) += sizeof(indx_t); + MP_UPPER(mp) -= ksize - sizeof(indx_t); + return MDB_SUCCESS; + } + + room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); /* free bytes left after reserving the new index slot */ + if (key != NULL) + node_size += key->mv_size; + if (IS_LEAF(mp)) { + mdb_cassert(mc, key && data); + if (F_ISSET(flags, F_BIGDATA)) { + /* Data already on overflow page. */ + node_size += sizeof(pgno_t); + } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { + int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); + int rc; + /* Put data on overflow page. 
*/ + DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", + data->mv_size, node_size+data->mv_size)); + node_size = EVEN(node_size + sizeof(pgno_t)); + if ((ssize_t)node_size > room) + goto full; + if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) + return rc; + DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); + flags |= F_BIGDATA; + goto update; /* size was already rounded and checked against room above */ + } else { + node_size += data->mv_size; + } + } + node_size = EVEN(node_size); + if ((ssize_t)node_size > room) + goto full; + +update: + /* Move higher pointers up one slot. */ + for (i = NUMKEYS(mp); i > indx; i--) + MP_PTRS(mp)[i] = MP_PTRS(mp)[i - 1]; + + /* Adjust free space offsets. */ + ofs = MP_UPPER(mp) - node_size; + mdb_cassert(mc, ofs >= MP_LOWER(mp) + sizeof(indx_t)); + MP_PTRS(mp)[indx] = ofs; + MP_UPPER(mp) = ofs; + MP_LOWER(mp) += sizeof(indx_t); + + /* Write the node data. */ + node = NODEPTR(mp, indx); + node->mn_ksize = (key == NULL) ? 0 : key->mv_size; + node->mn_flags = flags; + if (IS_LEAF(mp)) + SETDSZ(node,data->mv_size); + else + SETPGNO(node,pgno); + + if (key) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + if (IS_LEAF(mp)) { + ndata = NODEDATA(node); + if (ofp == NULL) { + if (F_ISSET(flags, F_BIGDATA)) + memcpy(ndata, data->mv_data, sizeof(pgno_t)); /* caller supplied F_BIGDATA: only sizeof(pgno_t) bytes of data are copied */ + else if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = ndata; /* MDB_RESERVE: copy nothing, hand the in-page address back through data */ + else + memcpy(ndata, data->mv_data, data->mv_size); + } else { + memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); /* the node itself stores only the overflow page number */ + ndata = METADATA(ofp); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = ndata; + else + memcpy(ndata, data->mv_data, data->mv_size); + } + } + + return MDB_SUCCESS; + +full: + DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + mdb_dbg_pgno(mp), NUMKEYS(mp))); + DPRINTF(("upper-lower = %u - %u = %"Z"d", MP_UPPER(mp),MP_LOWER(mp),room)); + DPRINTF(("node size = %"Z"u", node_size)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_FULL; +} + +/** Delete the specified node from a page. 
+ * @param[in] mc Cursor pointing to the node to delete. + * @param[in] ksize The size of a node. Only used if the page is + * part of a #MDB_DUPFIXED database. + * The node is addressed by the cursor's top-of-stack page and index + * (mc_pg[mc_top], mc_ki[mc_top]); the cursor itself is not modified. + */ +static void +mdb_node_del(MDB_cursor *mc, int ksize) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + indx_t indx = mc->mc_ki[mc->mc_top]; + unsigned int sz; + indx_t i, j, numkeys, ptr; + MDB_node *node; + char *base; + + DPRINTF(("delete node %u on %s page %"Z"u", indx, + IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); + numkeys = NUMKEYS(mp); + mdb_cassert(mc, indx < numkeys); + + if (IS_LEAF2(mp)) { + int x = numkeys - 1 - indx; /* number of keys stored after the deleted one */ + base = LEAF2KEY(mp, indx, ksize); + if (x) + memmove(base, base + ksize, x * ksize); + MP_LOWER(mp) -= sizeof(indx_t); + MP_UPPER(mp) += ksize - sizeof(indx_t); + return; + } + + node = NODEPTR(mp, indx); + sz = NODESIZE + node->mn_ksize; + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + sz += sizeof(pgno_t); + else + sz += NODEDSZ(node); + } + sz = EVEN(sz); /* total bytes the node occupies on the page */ + + ptr = MP_PTRS(mp)[indx]; + for (i = j = 0; i < numkeys; i++) { /* compact the pointer table, skipping the deleted slot */ + if (i != indx) { + MP_PTRS(mp)[j] = MP_PTRS(mp)[i]; + if (MP_PTRS(mp)[i] < ptr) + MP_PTRS(mp)[j] += sz; + j++; + } + } + + base = (char *)mp + MP_UPPER(mp) + PAGEBASE; + memmove(base + sz, base, ptr - MP_UPPER(mp)); /* slide the bytes between the upper bound and the deleted node up by sz */ + + MP_LOWER(mp) -= sizeof(indx_t); + MP_UPPER(mp) += sz; +} + +/** Compact the main page after deleting a node on a subpage. + * @param[in] mp The main page to operate on. + * @param[in] indx The index of the subpage on the main page. 
+ */ +static void +mdb_node_shrink(MDB_page *mp, indx_t indx) +{ + MDB_node *node; + MDB_page *sp, *xp; + char *base; + indx_t delta, nsize, len, ptr; + int i; + + node = NODEPTR(mp, indx); + sp = (MDB_page *)NODEDATA(node); /* the subpage is stored as this node's data */ + delta = SIZELEFT(sp); /* unused bytes inside the subpage */ + nsize = NODEDSZ(node) - delta; /* node data size once that slack is removed */ + + /* Prepare to shift upward, set len = length(subpage part to shift) */ + if (IS_LEAF2(sp)) { + len = nsize; + if (nsize & 1) + return; /* do not make the node uneven-sized */ + } else { + xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ + for (i = NUMKEYS(sp); --i >= 0; ) + MP_PTRS(xp)[i] = MP_PTRS(sp)[i] - delta; + len = PAGEHDRSZ; + } + MP_UPPER(sp) = MP_LOWER(sp); + COPY_PGNO(MP_PGNO(sp), mp->mp_pgno); + SETDSZ(node, nsize); + + /* Shift upward */ + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + delta, base, (char *)sp + len - base); + + ptr = mp->mp_ptrs[indx]; + for (i = NUMKEYS(mp); --i >= 0; ) { /* nodes at or below this node's offset moved up by delta */ + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] += delta; + } + mp->mp_upper += delta; +} + +/** Initial setup of a sorted-dups cursor. + * Sorted duplicates are implemented as a sub-database for the given key. + * The duplicate data items are actually keys of the sub-database. + * Operations on the duplicate data items are performed using a sub-cursor + * initialized when the sub-database is first accessed. This function does + * the preliminary setup of the sub-cursor, filling in the fields that + * depend only on the parent DB. + * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 
+ */ +static void +mdb_xcursor_init0(MDB_cursor *mc) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + mx->mx_cursor.mc_xcursor = NULL; + mx->mx_cursor.mc_txn = mc->mc_txn; + mx->mx_cursor.mc_db = &mx->mx_db; + mx->mx_cursor.mc_dbx = &mx->mx_dbx; + mx->mx_cursor.mc_dbi = mc->mc_dbi; + mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + mx->mx_dbx.md_name.mv_size = 0; + mx->mx_dbx.md_name.mv_data = NULL; + mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; /* the parent's data comparator becomes the sub-DB's key comparator */ + mx->mx_dbx.md_dcmp = NULL; + mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; +} + +/** Final setup of a sorted-dups cursor. + * Sets up the fields that depend on the data from the main cursor. + * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. + * @param[in] node The data containing the #MDB_db record for the + * sorted-dup database. + */ +static void +mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + if (node->mn_flags & F_SUBDATA) { /* node data is a full MDB_db record; the sub-cursor starts with an empty page stack */ + memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); + mx->mx_cursor.mc_pg[0] = 0; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + } else { /* node data is an inline page (fp); describe it as a depth-1, single-leaf DB */ + MDB_page *fp = NODEDATA(node); + mx->mx_db.md_pad = 0; + mx->mx_db.md_flags = 0; + mx->mx_db.md_depth = 1; + mx->mx_db.md_branch_pages = 0; + mx->mx_db.md_leaf_pages = 1; + mx->mx_db.md_overflow_pages = 0; + mx->mx_db.md_entries = NUMKEYS(fp); + COPY_PGNO(mx->mx_db.md_root, MP_PGNO(fp)); + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; + mx->mx_cursor.mc_pg[0] = fp; + mx->mx_cursor.mc_ki[0] = 0; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + mx->mx_db.md_flags = MDB_DUPFIXED; + mx->mx_db.md_pad = fp->mp_pad; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + mx->mx_db.md_flags |= MDB_INTEGERKEY; + } + } + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); + mx->mx_dbflag = 
DB_VALID|DB_USRVALID|DB_DUPDATA; +#if UINT_MAX < SIZE_MAX + if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) + mx->mx_dbx.md_cmp = mdb_cmp_clong; +#endif +} + + +/** Fixup a sorted-dups cursor due to underlying update. + * Sets up some fields that depend on the data from the main cursor. + * Almost the same as init1, but skips initialization steps if the + * xcursor had already been used. + * When \b new_dupdata is false and the xcursor is not #C_INITIALIZED, + * nothing is changed. + * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. + * @param[in] src_mx The xcursor of an up-to-date cursor. + * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. + */ +static void +mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + if (new_dupdata) { + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags |= C_INITIALIZED; + mx->mx_cursor.mc_ki[0] = 0; + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; +#if UINT_MAX < SIZE_MAX + mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; +#endif + } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { + return; + } + mx->mx_db = src_mx->mx_db; /* take the DB record and root page pointer from the up-to-date xcursor */ + mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); +} + +/** Initialize a cursor for a given transaction and database. 
*/ +static void +mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) +{ + mc->mc_next = NULL; + mc->mc_backup = NULL; + mc->mc_dbi = dbi; + mc->mc_txn = txn; + mc->mc_db = &txn->mt_dbs[dbi]; + mc->mc_dbx = &txn->mt_dbxs[dbi]; + mc->mc_dbflag = &txn->mt_dbflags[dbi]; + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_pg[0] = 0; + mc->mc_ki[0] = 0; + mc->mc_flags = 0; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + mdb_tassert(txn, mx != NULL); /* MDB_DUPSORT databases require caller-provided xcursor storage */ + mc->mc_xcursor = mx; + mdb_xcursor_init0(mc); + } else { + mc->mc_xcursor = NULL; + } + if (*mc->mc_dbflag & DB_STALE) { + mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); /* DB_STALE is set: run a root-only page search; its return value is ignored */ + } +} + +int +mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) /* Allocate a cursor on dbi in txn and return it through *ret. Returns MDB_SUCCESS, EINVAL, MDB_BAD_TXN or ENOMEM. */ +{ + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); + + if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EINVAL; /* FREE_DBI cursors are only allowed in read-only txns */ + + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); /* the xcursor lives in the same allocation, right after the cursor */ + + if ((mc = malloc(size)) != NULL) { + mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; /* linked into the txn's per-DBI cursor list; mdb_cursor_close() unlinks it */ + } + } else { + return ENOMEM; + } + + *ret = mc; + + return MDB_SUCCESS; +} + +int +mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) /* Re-initialize an existing cursor for use with txn, keeping its dbi and xcursor storage. */ +{ + if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) + return EINVAL; + + if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) + return EINVAL; /* rejected for cursors linked into a txn list, and for txns that keep cursor lists */ + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; +} + +/* Return the count of duplicate data items for the current key. The cursor must have an xcursor (otherwise MDB_INCOMPATIBLE is returned); a key whose node lacks F_DUPDATA counts as 1. */ +int +mdb_cursor_count(MDB_cursor *mc, size_t *countp) +{ + MDB_node *leaf; + + if (mc == NULL || countp == NULL) + return EINVAL; + + if (mc->mc_xcursor == NULL) + return MDB_INCOMPATIBLE; + + if 
(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (!mc->mc_snum) + return MDB_NOTFOUND; + + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; /* C_EOF is known to be set here, so the XOR clears it */ + } + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + *countp = 1; + } else { + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + return EINVAL; + + *countp = mc->mc_xcursor->mx_db.md_entries; + } + return MDB_SUCCESS; +} + +void +mdb_cursor_close(MDB_cursor *mc) /* Unlink (if tracked) and free a cursor; NULL cursors and cursors with mc_backup set are left untouched. */ +{ + if (mc && !mc->mc_backup) { + /* remove from txn, if tracked */ + if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + } + free(mc); + } +} + +MDB_txn * +mdb_cursor_txn(MDB_cursor *mc) /* Return the cursor's transaction, or NULL for a NULL cursor. */ +{ + if (!mc) return NULL; + return mc->mc_txn; +} + +MDB_dbi +mdb_cursor_dbi(MDB_cursor *mc) /* Return the cursor's database handle. NOTE(review): unlike mdb_cursor_txn(), mc is dereferenced without a NULL check. */ +{ + return mc->mc_dbi; +} + +/** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc Cursor pointing to the node to operate on. + * @param[in] key The new key to use. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_update_key(MDB_cursor *mc, MDB_val *key) +{ + MDB_page *mp; + MDB_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + indx_t ptr, i, numkeys, indx; + DKBUF; + + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; +#if MDB_DEBUG + { + MDB_val k2; + char kbuf2[DKBUF_MAXKEYSIZE*2+1]; + k2.mv_data = NODEKEY(node); + k2.mv_size = node->mn_ksize; + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", + indx, ptr, + mdb_dkey(&k2, kbuf2), + DKEY(key), + mp->mp_pgno)); + } +#endif + + /* Sizes must be 2-byte aligned. 
*/ + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; + + /* Shift node contents if EVEN(key length) changed. */ + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + pgno_t pgno; + /* not enough space left, do a delete and split */ + DPRINTF(("Not enough room, delta = %d, splitting...", delta)); + pgno = NODEPGNO(node); + mdb_node_del(mc, 0); + return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + } + + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { /* every node at or below this node's offset moves by delta */ + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mp->mp_upper -= delta; + + node = NODEPTR(mp, indx); /* re-fetch: the node's offset changed above */ + } + + /* But even if no shift was needed, update ksize */ + if (node->mn_ksize != key->mv_size) + node->mn_ksize = key->mv_size; + + if (key->mv_size) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + return MDB_SUCCESS; +} + +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); + +/** Perform \b act while tracking temporary cursor \b mn. + * \b mn is pushed onto the head of its txn's cursor list for its dbi + * before \b act runs and popped afterwards; for a #C_SUB cursor a dummy + * parent cursor whose mc_xcursor points at \b mn is linked instead. */ +#define WITH_CURSOR_TRACKING(mn, act) do { \ + MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + if ((mn).mc_flags & C_SUB) { \ + dummy.mc_flags = C_INITIALIZED; \ + dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ + tracked = &dummy; \ + } else { \ + tracked = &(mn); \ + } \ + tracked->mc_next = *tp; \ + *tp = tracked; \ + { act; } \ + *tp = tracked->mc_next; \ +} while (0) + +/** Move a node from csrc to cdst. + * Both pages are touched first, the node is added at cdst's current + * index and deleted from csrc, other tracked cursors on the two pages + * are adjusted, and the parent separator keys are updated. + * @param[in] csrc Cursor pointing at the node to move. + * @param[in] cdst Cursor pointing at the destination page and index. + * @param[in] fromleft Non-zero selects the "adding on the left" cursor + * fixups (other cursors are bumped up), zero the "adding on the right" + * fixups (other cursors are bumped down). + * @return 0 on success, non-zero on failure. + */ +static int +mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) +{ + MDB_node *srcnode; + MDB_val key, data; + pgno_t srcpg; + MDB_cursor mn; + int rc; + unsigned short flags; + + DKBUF; + + /* Mark src and dst as dirty. 
*/ + if ((rc = mdb_page_touch(csrc)) || + (rc = mdb_page_touch(cdst))) + return rc; + + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); + data.mv_size = 0; + data.mv_data = NULL; + srcpg = 0; + flags = 0; + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); + mdb_cassert(csrc, !((size_t)srcnode & 1)); + srcpg = NODEPGNO(srcnode); + flags = srcnode->mn_flags; + if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned int snum = csrc->mc_snum; + MDB_node *s2; + /* must find the lowest key below src */ + rc = mdb_page_search_lowest(csrc); + if (rc) + return rc; + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; /* put back the stack depth saved before the search */ + csrc->mc_top = snum; + } else { + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + } + mn.mc_xcursor = NULL; + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { + unsigned int snum = cdst->mc_snum; + MDB_node *s2; + MDB_val bkey; + /* must find the lowest key below dst */ + mdb_cursor_copy(cdst, &mn); + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + bkey.mv_size = mn.mc_db->md_pad; + bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + bkey.mv_size = NODEKSZ(s2); + bkey.mv_data = NODEKEY(s2); + } + mn.mc_snum = snum--; + mn.mc_top = snum; + mn.mc_ki[snum] = 0; + rc = mdb_update_key(&mn, &bkey); + if (rc) + return rc; + } + + DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", + 
IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", + csrc->mc_ki[csrc->mc_top], + DKEY(&key), + csrc->mc_pg[csrc->mc_top]->mp_pgno, + cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); + + /* Add the node to the destination page. + */ + rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); + if (rc != MDB_SUCCESS) + return rc; + + /* Delete the node from the source page. + */ + mdb_node_del(csrc, key.mv_size); + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mpd, *mps; + + mps = csrc->mc_pg[csrc->mc_top]; + /* If we're adding on the left, bump others up */ + if (fromleft) { + mpd = cdst->mc_pg[csrc->mc_top]; + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3 != cdst && + m3->mc_pg[csrc->mc_top] == mpd && + m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { + m3->mc_ki[csrc->mc_top]++; + } + if (m3 !=csrc && + m3->mc_pg[csrc->mc_top] == mps && + m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top-1]++; + } + if (IS_LEAF(mps)) + XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); + } + } else + /* Adding on the right, bump others down */ + { + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3->mc_pg[csrc->mc_top] == mps) { + if (!m3->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top-1]--; + } else { + 
m3->mc_ki[csrc->mc_top]--; + } + if (IS_LEAF(mps)) + XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); + } + } + } + } + + /* Update the parent separators. + */ + if (csrc->mc_ki[csrc->mc_top] == 0) { + if (csrc->mc_ki[csrc->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for source page %"Z"u to [%s]", + csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(csrc, &mn); + mn.mc_snum--; + mn.mc_top--; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_update_key(&mn, &key)); + if (rc) + return rc; + } + if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; + nullkey.mv_size = 0; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdb_update_key(csrc, &nullkey); /* give slot 0 of the branch page a zero-length key */ + csrc->mc_ki[csrc->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + if (cdst->mc_ki[cdst->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { /* NOTE(review): tests the source page's LEAF2 flag while the key is read from the destination page */ + key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for destination page %"Z"u to [%s]", + cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(cdst, &mn); + mn.mc_snum--; + mn.mc_top--; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_update_key(&mn, &key)); + if (rc) + return rc; + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { + MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; + nullkey.mv_size = 0; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdb_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; 
+ mdb_cassert(cdst, rc == MDB_SUCCESS); + } + } + + return MDB_SUCCESS; +} + +/** Merge one page into another. + * The nodes from the page pointed to by \b csrc will + * be copied to the page pointed to by \b cdst and then + * the \b csrc page will be freed. + * @param[in] csrc Cursor pointing to the source page. + * @param[in] cdst Cursor pointing to the destination page. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) +{ + MDB_page *psrc, *pdst; + MDB_node *srcnode; + MDB_val key, data; + unsigned nkeys; + int rc; + indx_t i, j; + + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + + DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); + + mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ + mdb_cassert(csrc, cdst->mc_snum > 1); + + /* Mark dst as dirty. */ + if ((rc = mdb_page_touch(cdst))) + return rc; + + /* get dst page again now that we've touched it. */ + pdst = cdst->mc_pg[cdst->mc_top]; + + /* Move all nodes from src to dst. 
+ */ + j = nkeys = NUMKEYS(pdst); /* new nodes are appended after dst's existing keys; nkeys keeps the original count */ + if (IS_LEAF2(psrc)) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = METADATA(psrc); + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); + if (rc != MDB_SUCCESS) + return rc; + key.mv_data = (char *)key.mv_data + key.mv_size; /* advance to the next fixed-size key */ + } + } else { + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + srcnode = NODEPTR(psrc, i); + if (i == 0 && IS_BRANCH(psrc)) { + MDB_cursor mn; + MDB_node *s2; + mdb_cursor_copy(csrc, &mn); + mn.mc_xcursor = NULL; + /* must find the lowest key below src */ + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + key.mv_size = mn.mc_db->md_pad; + key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + } else { + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + } + + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); + if (rc != MDB_SUCCESS) + return rc; + } + } + + DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", + pdst->mp_pgno, NUMKEYS(pdst), + (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); + + /* Unlink the src page from parent and add to free list. + */ + csrc->mc_top--; /* temporarily address the parent page */ + mdb_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + key.mv_size = 0; + rc = mdb_update_key(csrc, &key); + if (rc) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + psrc = csrc->mc_pg[csrc->mc_top]; + /* If not operating on FreeDB, allow this page to be reused + * in this txn. Otherwise just add to free list. 
+ */ + rc = mdb_page_loose(csrc, psrc); + if (rc) + return rc; + if (IS_LEAF(psrc)) + csrc->mc_db->md_leaf_pages--; + else + csrc->mc_db->md_branch_pages--; + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + unsigned int top = csrc->mc_top; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_snum < csrc->mc_snum) continue; + if (m3->mc_pg[top] == psrc) { + m3->mc_pg[top] = pdst; + m3->mc_ki[top] += nkeys; + m3->mc_ki[top-1] = cdst->mc_ki[top-1]; + } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && + m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { + m3->mc_ki[top-1]--; + } + if (IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, top, m3->mc_pg[top]); + } + } + { + unsigned int snum = cdst->mc_snum; + uint16_t depth = cdst->mc_db->md_depth; + mdb_cursor_pop(cdst); + rc = mdb_rebalance(cdst); + /* Did the tree height change? */ + if (depth != cdst->mc_db->md_depth) + snum += cdst->mc_db->md_depth - depth; + cdst->mc_snum = snum; + cdst->mc_top = snum-1; + } + return rc; +} + +/** Copy the contents of a cursor. + * Copies the txn, dbi, db and dbx references, the stack depth + * (mc_snum/mc_top), the flags, and the first mc_snum page/index pairs. + * mc_xcursor is not copied; callers set it on the copy themselves. + * @param[in] csrc The cursor to copy from. + * @param[out] cdst The cursor to copy to. + */ +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) +{ + unsigned int i; + + cdst->mc_txn = csrc->mc_txn; + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; + + /* Copy only the in-use part of the page stack. */ + for (i=0; i < csrc->mc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } +} + +/** Rebalance the tree after a delete operation. + * @param[in] mc Cursor pointing to the page where rebalancing + * should begin. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_rebalance(MDB_cursor *mc) +{ + MDB_node *node; + int rc, fromleft; + unsigned int ptop, minkeys, thresh; + MDB_cursor mn; + indx_t oldki; + + /* Branch pages must keep at least 2 keys and use the minimal fill threshold; leaf pages need 1 key and FILL_THRESHOLD. */ + if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { + minkeys = 2; + thresh = 1; + } else { + minkeys = 1; + thresh = FILL_THRESHOLD; + } + DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", + IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); + + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); + return MDB_SUCCESS; + } + + /* Root page: it has no neighbors, so it can only be emptied or collapsed. */ + if (mc->mc_snum < 2) { + MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + DPUTS("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } + if (NUMKEYS(mp) == 0) { + DPUTS("tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mc->mc_db->md_leaf_pages = 0; + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + /* Adjust cursors pointing to mp */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) + continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; + } + } + } + } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + int i; + DPUTS("collapsing root page!"); + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); + rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); + 
if (rc) + return rc; + mc->mc_db->md_depth--; + mc->mc_db->md_branch_pages--; + mc->mc_ki[0] = mc->mc_ki[1]; + /* Shift the rest of this cursor's page stack down one level (mc_pg[0] was reloaded above). */ + for (i = 1; i < mc->mc_db->md_depth; i++) { + mc->mc_pg[i] = mc->mc_pg[i+1]; + mc->mc_ki[i] = mc->mc_ki[i+1]; + } + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) continue; + if (!(m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_pg[0] == mp) { + /* Drop the old root from the other cursor's stack as well. */ + for (i=0; i < mc->mc_db->md_depth; i++) { + m3->mc_pg[i] = m3->mc_pg[i+1]; + m3->mc_ki[i] = m3->mc_ki[i+1]; + } + m3->mc_snum--; + m3->mc_top--; + } + } + } + } else + DPUTS("root page doesn't need rebalancing"); + return MDB_SUCCESS; + } + + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. + */ + ptop = mc->mc_top-1; + mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); + + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. + */ + + /* Find neighbors. + */ + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + + oldki = mc->mc_ki[mc->mc_top]; + if (mc->mc_ki[ptop] == 0) { + /* We're the leftmost leaf in our parent. + */ + DPUTS("reading right neighbor"); + mn.mc_ki[ptop]++; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + fromleft = 0; + } else { + /* There is at least one neighbor to the left. 
+ */ + DPUTS("reading left neighbor"); + mn.mc_ki[ptop]--; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; + mc->mc_ki[mc->mc_top] = 0; + fromleft = 1; + } + + DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", + mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); + + /* If the neighbor page is above threshold and has enough keys, + * move one key from it. Otherwise we should try to merge them. + * (A branch page must never have less than 2 keys.) + */ + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { + rc = mdb_node_move(&mn, mc, fromleft); + if (fromleft) { + /* if we inserted on left, bump position up */ + oldki++; + } + } else { + if (!fromleft) { + rc = mdb_page_merge(&mn, mc); + } else { + oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_page_merge(mc, &mn)); + mdb_cursor_copy(&mn, mc); + } + mc->mc_flags &= ~C_EOF; + } + /* Restore the caller's key index, adjusted above for any keys added on its left. */ + mc->mc_ki[mc->mc_top] = oldki; + return rc; +} + +/** Complete a delete operation started by #mdb_cursor_del(). */ +static int +mdb_cursor_del0(MDB_cursor *mc) +{ + int rc; + MDB_page *mp; + indx_t ki; + unsigned int nkeys; + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdb_node_del(mc, mc->mc_db->md_pad); + mc->mc_db->md_entries--; + { + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! 
(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; + } + XCURSOR_REFRESH(m3, mc->mc_top, mp); + } + } + } + rc = mdb_rebalance(mc); + if (rc) + goto fail; + + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by mdb_rebalance and aren't needed here. + */ + if (!mc->mc_snum) { + mc->mc_flags |= C_EOF; + return rc; + } + + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; + } + if (rc) + goto fail; + } + if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { + MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not initd it must be reinited. + * Else if node points to a subDB, nothing is needed. + * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. 
+ */ + if (node->mn_flags & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node->mn_flags & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } else { + mdb_xcursor_init1(m3, node); + rc = mdb_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (rc) + goto fail; + } + } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + } + } + } + mc->mc_flags |= C_DEL; + +fail: + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_del(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data) +{ + if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + /* must ignore any data */ + data = NULL; + } + + return mdb_del0(txn, dbi, key, data, 0); +} + +static int +mdb_del0(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned flags) +{ + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata, *xdata; + int rc, exact = 0; + DKBUF; + + DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); + + mdb_cursor_init(&mc, txn, dbi, &mx); + + if (data) { + op = MDB_GET_BOTH; + rdata = *data; + xdata = &rdata; + } else { + op = MDB_SET; + xdata = NULL; + flags |= MDB_NODUPDATA; + } + rc = mdb_cursor_set(&mc, key, xdata, op, &exact); + if (rc == 0) { + /* let mdb_page_split know about this cursor if needed: + * delete will trigger a rebalance; if it needs to move + * a node from one page to another, it will have to + * update the parent's separator key(s). If the new sepkey + * is larger than the current one, the parent page may + * run out of space, triggering a split. We need this + * cursor to be consistent until the end of the rebalance. 
+ */ + mc.mc_flags |= C_UNTRACK; + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_del(&mc, flags); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; +} + +/** Split a page and insert a new node. + * Set #MDB_TXN_ERROR on failure. + * @param[in,out] mc Cursor pointing to the page and desired insertion index. + * The cursor will be updated to point to the actual page and index where + * the node got inserted after the split. + * @param[in] newkey The key for the newly inserted node. + * @param[in] newdata The data for the newly inserted node. + * @param[in] newpgno The page number, if the new node is a branch node. + * @param[in] nflags The #NODE_ADD_FLAGS for the new node. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, + unsigned int nflags) +{ + unsigned int flags; + int rc = MDB_SUCCESS, new_root = 0, did_split = 0; + indx_t newindx; + pgno_t pgno = 0; + int i, j, split_indx, nkeys, pmax; + MDB_env *env = mc->mc_txn->mt_env; + MDB_node *node; + MDB_val sepkey, rkey, xdata, *rdata = &xdata; + MDB_page *copy = NULL; + MDB_page *mp, *rp, *pp; + int ptop; + MDB_cursor mn; + DKBUF; + + mp = mc->mc_pg[mc->mc_top]; + newindx = mc->mc_ki[mc->mc_top]; + nkeys = NUMKEYS(mp); + + DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, + DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); + + /* Create a right sibling. */ + if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) + return rc; + rp->mp_pad = mp->mp_pad; + DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); + + /* Usually when splitting the root page, the cursor + * height is 1. But when called from mdb_update_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. 
+ */ + if (mc->mc_top < 1) { + if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) + goto done; + /* shift current top to make room for new parent */ + for (i=mc->mc_snum; i>0; i--) { + mc->mc_pg[i] = mc->mc_pg[i-1]; + mc->mc_ki[i] = mc->mc_ki[i-1]; + } + mc->mc_pg[0] = pp; + mc->mc_ki[0] = 0; + mc->mc_db->md_root = pp->mp_pgno; + DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); + new_root = mc->mc_db->md_depth++; + + /* Add left (implicit) pointer. */ + if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { + /* undo the pre-push */ + mc->mc_pg[0] = mc->mc_pg[1]; + mc->mc_ki[0] = mc->mc_ki[1]; + mc->mc_db->md_root = mp->mp_pgno; + mc->mc_db->md_depth--; + goto done; + } + mc->mc_snum++; + mc->mc_top++; + ptop = 0; + } else { + ptop = mc->mc_top-1; + DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); + } + + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + mn.mc_pg[mn.mc_top] = rp; + mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; + + if (nflags & MDB_APPEND) { + mn.mc_ki[mn.mc_top] = 0; + sepkey = *newkey; + split_indx = newindx; + nkeys = 0; + } else { + + split_indx = (nkeys+1) / 2; + + if (IS_LEAF2(rp)) { + char *split, *ins; + int x; + unsigned int lsize, rsize, ksize; + /* Move half of the keys to the right sibling */ + x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_pad; + split = LEAF2KEY(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mp->mp_lower -= lsize; + rp->mp_lower += lsize; + mp->mp_upper += rsize - lsize; + rp->mp_upper -= rsize - lsize; + sepkey.mv_size = ksize; + if (newindx == split_indx) { + sepkey.mv_data = newkey->mv_data; + } else { + sepkey.mv_data = split; + } + if (x<0) { + ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(rp->mp_ptrs, split, rsize); + sepkey.mv_data = rp->mp_ptrs; + memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->mv_data, ksize); + mp->mp_lower += 
sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + } else { + if (x) + memcpy(rp->mp_ptrs, split, x * ksize); + ins = LEAF2KEY(rp, x, ksize); + memcpy(ins, newkey->mv_data, ksize); + memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); + rp->mp_lower += sizeof(indx_t); + rp->mp_upper -= ksize - sizeof(indx_t); + mc->mc_ki[mc->mc_top] = x; + } + } else { + int psize, nsize, k, keythresh; + + /* Maximum free space in an empty page */ + pmax = env->me_psize - PAGEHDRSZ; + /* Threshold number of keys considered "small" */ + keythresh = env->me_psize >> 7; + + if (IS_LEAF(mp)) + nsize = mdb_leaf_size(env, newkey, newdata); + else + nsize = mdb_branch_size(env, newkey); + nsize = EVEN(nsize); + + /* grab a page to hold a temporary copy */ + copy = mdb_page_malloc(mc->mc_txn, 1); + if (copy == NULL) { + rc = ENOMEM; + goto done; + } + copy->mp_pgno = mp->mp_pgno; + copy->mp_flags = mp->mp_flags; + copy->mp_lower = (PAGEHDRSZ-PAGEBASE); + copy->mp_upper = env->me_psize - PAGEBASE; + + /* prepare to insert */ + for (i=0, j=0; imp_ptrs[j++] = 0; + } + copy->mp_ptrs[j++] = mp->mp_ptrs[i]; + } + + /* When items are relatively large the split point needs + * to be checked, because being off-by-one will make the + * difference between success or failure in mdb_node_add. + * + * It's also relevant if a page happens to be laid out + * such that one half of its nodes are all "small" and + * the other half of its nodes are "large." If the new + * item is also "large" and falls on the half with + * "large" nodes, it also may not fit. + * + * As a final tweak, if the new item goes on the last + * spot on the page (and thus, onto the new page), bias + * the split so the new page is emptier than the old page. + * This yields better packing during sequential inserts. + */ + if (nkeys < keythresh || nsize > pmax/16 || newindx >= nkeys) { + /* Find split point */ + psize = 0; + if (newindx <= split_indx || newindx >= nkeys) { + i = 0; j = 1; + k = newindx >= nkeys ? 
nkeys : split_indx+1+IS_LEAF(mp); + } else { + i = nkeys; j = -1; + k = split_indx-1; + } + for (; i!=k; i+=j) { + if (i == newindx) { + psize += nsize; + node = NULL; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + } + psize = EVEN(psize); + } + if (psize > pmax || i == k-j) { + split_indx = i + (j<0); + break; + } + } + } + if (split_indx == newindx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } + } + } + + DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); + + /* Copy separator key to the parent. + */ + if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { + int snum = mc->mc_snum; + mn.mc_snum--; + mn.mc_top--; + did_split = 1; + /* We want other splits to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); + if (rc) + goto done; + + /* root split? */ + if (mc->mc_snum > snum) { + ptop++; + } + /* Right page might now have changed parent. + * Check if left page also changed parent. 
+ */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; imc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + if (mn.mc_ki[ptop]) { + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } else { + /* find right page's left sibling */ + mc->mc_ki[ptop] = mn.mc_ki[ptop]; + mdb_cursor_sibling(mc, 0); + } + } + } else { + mn.mc_top--; + rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); + mn.mc_top++; + } + if (rc != MDB_SUCCESS) { + goto done; + } + if (nflags & MDB_APPEND) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[mc->mc_top] = 0; + rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); + if (rc) + goto done; + for (i=0; imc_top; i++) + mc->mc_ki[i] = mn.mc_ki[i]; + } else if (!IS_LEAF2(mp)) { + /* Move nodes */ + mc->mc_pg[mc->mc_top] = rp; + i = split_indx; + j = 0; + do { + if (i == newindx) { + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(mp)) { + rdata = newdata; + } else + pgno = newpgno; + flags = nflags; + /* Update index for the new key. */ + mc->mc_ki[mc->mc_top] = j; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(mp)) { + xdata.mv_data = NODEDATA(node); + xdata.mv_size = NODEDSZ(node); + rdata = &xdata; + } else + pgno = NODEPGNO(node); + flags = node->mn_flags; + } + + if (!IS_LEAF(mp) && j == 0) { + /* First branch index doesn't need key data. 
*/ + rkey.mv_size = 0; + } + + rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); + if (rc) + goto done; + if (i == nkeys) { + i = 0; + j = 0; + mc->mc_pg[mc->mc_top] = copy; + } else { + i++; + j++; + } + } while (i != split_indx); + + nkeys = NUMKEYS(copy); + for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; + mp->mp_lower = copy->mp_lower; + mp->mp_upper = copy->mp_upper; + memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), + env->me_psize - copy->mp_upper - PAGEBASE); + + /* reset back to original page */ + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + } else { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + if (nflags & MDB_RESERVE) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + if (newindx >= split_indx) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. 
+ */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + } + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + nkeys = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (new_root) { + int k; + /* sub cursors may be on different DB */ + if (m3->mc_pg[0] != mp) + continue; + /* root split */ + for (k=new_root; k>=0; k--) { + m3->mc_ki[k+1] = m3->mc_ki[k]; + m3->mc_pg[k+1] = m3->mc_pg[k]; + } + if (m3->mc_ki[0] >= nkeys) { + m3->mc_ki[0] = 1; + } else { + m3->mc_ki[0] = 0; + } + m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum++; + m3->mc_top++; + } + if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) + m3->mc_ki[mc->mc_top]++; + if (m3->mc_ki[mc->mc_top] >= nkeys) { + m3->mc_pg[mc->mc_top] = rp; + m3->mc_ki[mc->mc_top] -= nkeys; + for (i=0; imc_top; i++) { + m3->mc_ki[i] = mn.mc_ki[i]; + m3->mc_pg[i] = mn.mc_pg[i]; + } + } + } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && + m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { + m3->mc_ki[ptop]++; + } + if (IS_LEAF(mp)) + XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); + } + } + DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); + +done: + if (copy) /* tmp page */ + mdb_page_free(env, copy); + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_put(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned int flags) +{ + MDB_cursor mc; + MDB_xcursor mx; + int rc; + + if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (flags & 
~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + mdb_cursor_init(&mc, txn, dbi, &mx); + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_put(&mc, key, data, flags); + txn->mt_cursors[dbi] = mc.mc_next; + return rc; +} + +#ifndef MDB_WBUF +#define MDB_WBUF (1024*1024) +#endif +#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ + + /** State needed for a double-buffering compacting copy. */ +typedef struct mdb_copy { + MDB_env *mc_env; + MDB_txn *mc_txn; + pthread_mutex_t mc_mutex; + pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ + char *mc_wbuf[2]; + char *mc_over[2]; + int mc_wlen[2]; + int mc_olen[2]; + pgno_t mc_next_pgno; + HANDLE mc_fd; + int mc_toggle; /**< Buffer number in provider */ + int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ + /** Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. + */ + volatile int mc_error; +} mdb_copy; + + /** Dedicated writer thread for compacting copy. 
*/ +static THREAD_RET ESECT CALL_CONV +mdb_env_copythr(void *arg) +{ + mdb_copy *my = arg; + char *ptr; + int toggle = 0, wsize, rc; +#ifdef _WIN32 + DWORD len; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + int len; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#ifdef SIGPIPE + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) + my->mc_error = rc; +#endif +#endif + + pthread_mutex_lock(&my->mc_mutex); + for(;;) { + while (!my->mc_new) + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ + break; + wsize = my->mc_wlen[toggle]; + ptr = my->mc_wbuf[toggle]; +again: + rc = MDB_SUCCESS; + while (wsize > 0 && !my->mc_error) { + DO_WRITE(rc, my->mc_fd, ptr, wsize, len); + if (!rc) { + rc = ErrCode(); +#if defined(SIGPIPE) && !defined(_WIN32) + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise at least OS X + * gives it to the process on thread-exit (ITS#8504). + */ + int tmp; + sigwait(&set, &tmp); + } +#endif + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + if (rc) { + my->mc_error = rc; + } + /* If there's an overflow page tail, write it too */ + if (my->mc_olen[toggle]) { + wsize = my->mc_olen[toggle]; + ptr = my->mc_over[toggle]; + my->mc_olen[toggle] = 0; + goto again; + } + my->mc_wlen[toggle] = 0; + toggle ^= 1; + /* Return the empty buffer to provider */ + my->mc_new--; + pthread_cond_signal(&my->mc_cond); + } + pthread_mutex_unlock(&my->mc_mutex); + return (THREAD_RET)0; +#undef DO_WRITE +} + + /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. + * + * @param[in] my control structure. + * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 
+ */ +static int ESECT +mdb_env_cthr_toggle(mdb_copy *my, int adjust) +{ + pthread_mutex_lock(&my->mc_mutex); + my->mc_new += adjust; + pthread_cond_signal(&my->mc_cond); + while (my->mc_new & 2) /* both buffers in use */ + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + pthread_mutex_unlock(&my->mc_mutex); + + my->mc_toggle ^= (adjust & 1); + /* Both threads reset mc_wlen, to be safe from threading errors */ + my->mc_wlen[my->mc_toggle] = 0; + return my->mc_error; +} + + /** Depth-first tree traversal for compacting copy. + * @param[in] my control structure. + * @param[in,out] pg database root. + * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. + */ +static int ESECT +mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) +{ + MDB_cursor mc = {0}; + MDB_node *ni; + MDB_page *mo, *mp, *leaf; + char *buf, *ptr; + int rc, toggle; + unsigned int i; + + /* Empty DB, nothing to do */ + if (*pg == P_INVALID) + return MDB_SUCCESS; + + mc.mc_snum = 1; + mc.mc_txn = my->mc_txn; + + rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); + if (rc) + return rc; + rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); + if (rc) + return rc; + + /* Make cursor pages writable */ + buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); + if (buf == NULL) + return ENOMEM; + + for (i=0; imc_env->me_psize); + mc.mc_pg[i] = (MDB_page *)ptr; + ptr += my->mc_env->me_psize; + } + + /* This is writable space for a leaf page. Usually not needed. 
*/ + leaf = (MDB_page *)ptr; + + toggle = my->mc_toggle; + while (mc.mc_snum > 0) { + unsigned n; + mp = mc.mc_pg[mc.mc_top]; + n = NUMKEYS(mp); + + if (IS_LEAF(mp)) { + if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); + rc = mdb_page_get(&mc, pg, &omp, NULL); + if (rc) + goto done; + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + memcpy(mo, omp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno; + my->mc_next_pgno += omp->mp_pages; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (omp->mp_pages > 1) { + my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); + my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + } else if (ni->mn_flags & F_SUBDATA) { + MDB_db db; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&db, NODEDATA(ni), sizeof(db)); + my->mc_toggle = toggle; + rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); + if (rc) + goto done; + toggle = my->mc_toggle; + memcpy(NODEDATA(ni), &db, sizeof(db)); + } + } + } + } else { + mc.mc_ki[mc.mc_top]++; + if (mc.mc_ki[mc.mc_top] < n) { + pgno_t pg; +again: + ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); + pg = NODEPGNO(ni); + rc = mdb_page_get(&mc, pg, &mp, NULL); + if (rc) + goto done; + mc.mc_top++; + mc.mc_snum++; + mc.mc_ki[mc.mc_top] = 0; + if (IS_BRANCH(mp)) { + /* Whenever we advance to a 
sibling branch page, + * we must proceed all the way down to its first leaf. + */ + mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); + goto again; + } else + mc.mc_pg[mc.mc_top] = mp; + continue; + } + } + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + mdb_page_copy(mo, mp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno++; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (mc.mc_top) { + /* Update parent if there is one */ + ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); + SETPGNO(ni, mo->mp_pgno); + mdb_cursor_pop(&mc); + } else { + /* Otherwise we're done */ + *pg = mo->mp_pgno; + break; + } + } +done: + free(buf); + return rc; +} + + /** Copy environment with compaction. */ +static int ESECT +mdb_env_copyfd1(MDB_env *env, HANDLE fd) +{ + MDB_meta *mm; + MDB_page *mp; + mdb_copy my = {0}; + MDB_txn *txn = NULL; + pthread_t thr; + pgno_t root, new_root; + int rc = MDB_SUCCESS; + +#ifdef _WIN32 + if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || + !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { + rc = ErrCode(); + goto done; + } + my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); + if (my.mc_wbuf[0] == NULL) { + /* _aligned_malloc() sets errno, but we use Windows error codes */ + rc = ERROR_NOT_ENOUGH_MEMORY; + goto done; + } +#else + if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) + return rc; + if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) + goto done2; +#ifdef HAVE_MEMALIGN + my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); + if (my.mc_wbuf[0] == NULL) { + rc = errno; + goto done; + } +#else + { + void *p; + if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) + goto done; + my.mc_wbuf[0] = p; + } +#endif +#endif + memset(my.mc_wbuf[0], 0, MDB_WBUF*2); + my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; + my.mc_next_pgno = 
NUM_METAS; + my.mc_env = env; + my.mc_fd = fd; + rc = THREAD_CREATE(thr, mdb_env_copythr, &my); + if (rc) + goto done; + + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + goto finish; + + mp = (MDB_page *)my.mc_wbuf[0]; + memset(mp, 0, NUM_METAS * env->me_psize); + mp->mp_pgno = 0; + mp->mp_flags = P_META; + mm = (MDB_meta *)METADATA(mp); + mdb_env_init_meta0(env, mm); + mm->mm_address = env->me_metas[0]->mm_address; + + mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); + mp->mp_pgno = 1; + mp->mp_flags = P_META; + *(MDB_meta *)METADATA(mp) = *mm; + mm = (MDB_meta *)METADATA(mp); + + /* Set metapage 1 with current main DB */ + root = new_root = txn->mt_dbs[MAIN_DBI].md_root; + if (root != P_INVALID) { + /* Count free pages + freeDB pages. Subtract from last_pg + * to find the new last_pg, which also becomes the new root. + */ + MDB_ID freecount = 0; + MDB_cursor mc; + MDB_val key, data; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + if (rc != MDB_NOTFOUND) + goto finish; + freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + + txn->mt_dbs[FREE_DBI].md_leaf_pages + + txn->mt_dbs[FREE_DBI].md_overflow_pages; + + new_root = txn->mt_next_pgno - 1 - freecount; + mm->mm_last_pg = new_root; + mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + mm->mm_dbs[MAIN_DBI].md_root = new_root; + } else { + /* When the DB is empty, handle it specially to + * fix any breakage like page leaks from ITS#8174. 
+ */ + mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; + } + if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { + mm->mm_txnid = 1; /* use metapage 1 */ + } + + my.mc_wlen[0] = env->me_psize * NUM_METAS; + my.mc_txn = txn; + rc = mdb_env_cwalk(&my, &root, 0); + if (rc == MDB_SUCCESS && root != new_root) { + rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ + } + +finish: + if (rc) + my.mc_error = rc; + mdb_env_cthr_toggle(&my, 1 | MDB_EOF); + rc = THREAD_FINISH(thr); + mdb_txn_abort(txn); + +done: +#ifdef _WIN32 + if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); + if (my.mc_cond) CloseHandle(my.mc_cond); + if (my.mc_mutex) CloseHandle(my.mc_mutex); +#else + free(my.mc_wbuf[0]); + pthread_cond_destroy(&my.mc_cond); +done2: + pthread_mutex_destroy(&my.mc_mutex); +#endif + return rc ? rc : my.mc_error; +} + + /** Copy environment as-is. */ +static int ESECT +mdb_env_copyfd0(MDB_env *env, HANDLE fd) +{ + MDB_txn *txn = NULL; + mdb_mutexref_t wmutex = NULL; + int rc; + size_t wsize, w3; + char *ptr; +#ifdef _WIN32 + DWORD len, w2; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + ssize_t len; + size_t w2; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#endif + + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. 
+ */ + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + if (env->me_txns) { + /* We must start the actual read txn after blocking writers */ + mdb_txn_end(txn, MDB_END_RESET_TMP); + + /* Temporarily block writers until we snapshot the meta pages */ + wmutex = env->me_wmutex; + if (LOCK_MUTEX(rc, env, wmutex)) + goto leave; + + rc = mdb_txn_renew0(txn); + if (rc) { + UNLOCK_MUTEX(wmutex); + goto leave; + } + } + + wsize = env->me_psize * NUM_METAS; + ptr = env->me_map; + w2 = wsize; + while (w2 > 0) { + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + /* Non-blocking or async handles are not supported */ + rc = EIO; + break; + } + } + if (wmutex) + UNLOCK_MUTEX(wmutex); + + if (rc) + goto leave; + + w3 = txn->mt_next_pgno * env->me_psize; + { + size_t fsize = 0; + if ((rc = mdb_fsize(env->me_fd, &fsize))) + goto leave; + if (w3 > fsize) + w3 = fsize; + } + wsize = w3 - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + +leave: + mdb_txn_abort(txn); + return rc; +} + +int ESECT +mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) +{ + if (flags & MDB_CP_COMPACT) + return mdb_env_copyfd1(env, fd); + else + return mdb_env_copyfd0(env, fd); +} + +int ESECT +mdb_env_copyfd(MDB_env *env, HANDLE fd) +{ + return mdb_env_copyfd2(env, fd, 0); +} + +int ESECT +mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) +{ + int rc; + MDB_name fname; + HANDLE newfd = INVALID_HANDLE_VALUE; + + rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); + if (rc == MDB_SUCCESS) { + rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); + mdb_fname_destroy(fname); + } + if (rc == 
MDB_SUCCESS) { + rc = mdb_env_copyfd2(env, newfd, flags); + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = ErrCode(); + } + return rc; +} + +int ESECT +mdb_env_copy(MDB_env *env, const char *path) +{ + return mdb_env_copy2(env, path, 0); +} + +int ESECT +mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) +{ + if (flag & ~CHANGEABLE) + return EINVAL; + if (onoff) + env->me_flags |= flag; + else + env->me_flags &= ~flag; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_flags(MDB_env *env, unsigned int *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_flags & (CHANGEABLE|CHANGELESS); + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_userctx(MDB_env *env, void *ctx) +{ + if (!env) + return EINVAL; + env->me_userctx = ctx; + return MDB_SUCCESS; +} + +void * ESECT +mdb_env_get_userctx(MDB_env *env) +{ + return env ? env->me_userctx : NULL; +} + +int ESECT +mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) +{ + if (!env) + return EINVAL; +#ifndef NDEBUG + env->me_assert_func = func; +#endif + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_path(MDB_env *env, const char **arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_path; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_fd; + return MDB_SUCCESS; +} + +/** Common code for #mdb_stat() and #mdb_env_stat(). + * @param[in] env the environment to operate in. + * @param[in] db the #MDB_db record containing the stats to return. + * @param[out] arg the address of an #MDB_stat structure to receive the stats. + * @return 0, this function always succeeds. 
+ */ +static int ESECT +mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) +{ + arg->ms_psize = env->me_psize; + arg->ms_depth = db->md_depth; + arg->ms_branch_pages = db->md_branch_pages; + arg->ms_leaf_pages = db->md_leaf_pages; + arg->ms_overflow_pages = db->md_overflow_pages; + arg->ms_entries = db->md_entries; + + return MDB_SUCCESS; +} + +int ESECT +mdb_env_stat(MDB_env *env, MDB_stat *arg) +{ + MDB_meta *meta; + + if (env == NULL || arg == NULL) + return EINVAL; + + meta = mdb_env_pick_meta(env); + + return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); +} + +int ESECT +mdb_env_info(MDB_env *env, MDB_envinfo *arg) +{ + MDB_meta *meta; + + if (env == NULL || arg == NULL) + return EINVAL; + + meta = mdb_env_pick_meta(env); + arg->me_mapaddr = meta->mm_address; + arg->me_last_pgno = meta->mm_last_pg; + arg->me_last_txnid = meta->mm_txnid; + + arg->me_mapsize = env->me_mapsize; + arg->me_maxreaders = env->me_maxreaders; + arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; + return MDB_SUCCESS; +} + +/** Set the default comparison functions for a database. + * Called immediately after a database is opened to set the defaults. + * The user can then override them with #mdb_set_compare() or + * #mdb_set_dupsort(). + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + */ +static void +mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) +{ + uint16_t f = txn->mt_dbs[dbi].md_flags; + + txn->mt_dbxs[dbi].md_cmp = + (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : + (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn; + + txn->mt_dbxs[dbi].md_dcmp = + !(f & MDB_DUPSORT) ? 0 : + ((f & MDB_INTEGERDUP) + ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) + : ((f & MDB_REVERSEDUP) ? 
mdb_cmp_memnr : mdb_cmp_memn)); +} + +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) +{ + MDB_val key, data; + MDB_dbi i; + MDB_cursor mc; + MDB_db dummy; + int rc, dbflag, exact; + unsigned int unused = 0, seq; + char *namedup; + size_t len; + + if (flags & ~VALID_FLAGS) + return EINVAL; + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + /* main DB? */ + if (!name) { + *dbi = MAIN_DBI; + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + /* make sure flag changes get committed */ + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } + } + mdb_default_cmp(txn, MAIN_DBI); + return MDB_SUCCESS; + } + + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, MAIN_DBI); + } + + /* Is the DB already open? */ + len = strlen(name); + for (i=CORE_DBS; imt_numdbs; i++) { + if (!txn->mt_dbxs[i].md_name.mv_size) { + /* Remember this free slot */ + if (!unused) unused = i; + continue; + } + if (len == txn->mt_dbxs[i].md_name.mv_size && + !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + *dbi = i; + return MDB_SUCCESS; + } + } + + /* If no free slot and max hit, fail */ + if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) + return MDB_DBS_FULL; + + /* Cannot mix named databases with some mainDB flags */ + if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) + return (flags & MDB_CREATE) ? 
MDB_INCOMPATIBLE : MDB_NOTFOUND; + + /* Find the DB info */ + dbflag = DB_NEW|DB_VALID|DB_USRVALID; + exact = 0; + key.mv_size = len; + key.mv_data = (void *)name; + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (rc == MDB_SUCCESS) { + /* make sure this is actually a DB */ + MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) + return MDB_INCOMPATIBLE; + } else { + if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE)) + return rc; + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; + } + + /* Done here so we cannot fail after creating a new DB */ + if ((namedup = strdup(name)) == NULL) + return ENOMEM; + + if (rc) { + /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ + data.mv_size = sizeof(MDB_db); + data.mv_data = &dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.md_root = P_INVALID; + dummy.md_flags = flags & PERSISTENT_FLAGS; + WITH_CURSOR_TRACKING(mc, + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); + dbflag |= DB_DIRTY; + } + + if (rc) { + free(namedup); + } else { + /* Got info, register DBI in this txn */ + unsigned int slot = unused ? 
unused : txn->mt_numdbs; + txn->mt_dbxs[slot].md_name.mv_data = namedup; + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_rel = NULL; + txn->mt_dbflags[slot] = dbflag; + /* txn-> and env-> are the same in read txns, use + * tmp variable to avoid undefined assignment + */ + seq = ++txn->mt_env->me_dbiseqs[slot]; + txn->mt_dbiseqs[slot] = seq; + + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + *dbi = slot; + mdb_default_cmp(txn, slot); + if (!unused) { + txn->mt_numdbs++; + } + } + + return rc; +} + +int ESECT +mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) +{ + if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (txn->mt_dbflags[dbi] & DB_STALE) { + MDB_cursor mc; + MDB_xcursor mx; + /* Stale, must read the DB's root. cursor_init does it for us. */ + mdb_cursor_init(&mc, txn, dbi, &mx); + } + return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); +} + +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) +{ + char *ptr; + if (dbi < CORE_DBS || dbi >= env->me_maxdbs) + return; + ptr = env->me_dbxs[dbi].md_name.mv_data; + /* If there was no name, this was already closed */ + if (ptr) { + env->me_dbxs[dbi].md_name.mv_data = NULL; + env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbflags[dbi] = 0; + env->me_dbiseqs[dbi]++; + free(ptr); + } +} + +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) +{ + /* We could return the flags for the FREE_DBI too but what's the point? */ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + return MDB_SUCCESS; +} + +/** Add all the DB's pages to the free list. + * @param[in] mc Cursor on the DB to free. + * @param[in] subs non-Zero to check for sub-DBs in this DB. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_drop0(MDB_cursor *mc, int subs) +{ + int rc; + + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc == MDB_SUCCESS) { + MDB_txn *txn = mc->mc_txn; + MDB_node *ni; + MDB_cursor mx; + unsigned int i; + + /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. + * This also avoids any P_LEAF2 pages, which have no nodes. + * Also if the DB doesn't have sub-DBs and has no overflow + * pages, omit scanning leaves. + */ + if ((mc->mc_flags & C_SUB) || + (!subs && !mc->mc_db->md_overflow_pages)) + mdb_cursor_pop(mc); + + mdb_cursor_copy(mc, &mx); + while (mc->mc_snum > 0) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + unsigned n = NUMKEYS(mp); + if (IS_LEAF(mp)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdb_page_get(mc, pg, &omp, NULL); + if (rc != 0) + goto done; + mdb_cassert(mc, IS_OVERFLOW(omp)); + rc = mdb_midl_append_range(&txn->mt_free_pgs, + pg, omp->mp_pages); + if (rc) + goto done; + mc->mc_db->md_overflow_pages -= omp->mp_pages; + if (!mc->mc_db->md_overflow_pages && !subs) + break; + } else if (subs && (ni->mn_flags & F_SUBDATA)) { + mdb_xcursor_init1(mc, ni); + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + goto done; + } + } + if (!subs && !mc->mc_db->md_overflow_pages) + goto pop; + } else { + if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) + goto done; + for (i=0; imt_free_pgs, pg); + } + } + if (!mc->mc_top) + break; + mc->mc_ki[mc->mc_top] = i; + rc = mdb_cursor_sibling(mc, 1); + if (rc) { + if (rc != MDB_NOTFOUND) + goto done; + /* no more siblings, go back to beginning + * of previous level. 
+ */ +pop: + mdb_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (i=1; imc_snum; i++) { + mc->mc_ki[i] = 0; + mc->mc_pg[i] = mx.mc_pg[i]; + } + } + } + /* free it */ + rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); +done: + if (rc) + txn->mt_flags |= MDB_TXN_ERROR; + } else if (rc == MDB_NOTFOUND) { + rc = MDB_SUCCESS; + } + mc->mc_flags &= ~C_INITIALIZED; + return rc; +} + +int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) +{ + MDB_cursor *mc, *m2; + int rc; + + if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; + + if (TXN_DBI_CHANGED(txn, dbi)) + return MDB_BAD_DBI; + + rc = mdb_cursor_open(txn, dbi, &mc); + if (rc) + return rc; + + rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + /* Invalidate the dropped DB's cursors */ + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); + if (rc) + goto leave; + + /* Can't delete the main DB */ + if (del && dbi >= CORE_DBS) { + rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; + mdb_dbi_close(txn->mt_env, dbi); + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + } else { + /* reset the DB record, mark it dirty */ + txn->mt_dbflags[dbi] |= DB_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; + + txn->mt_flags |= MDB_TXN_DIRTY; + } +leave: + mdb_cursor_close(mc); + return rc; +} + +int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_cmp = cmp; + return MDB_SUCCESS; +} + +int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + 
txn->mt_dbxs[dbi].md_dcmp = cmp; + return MDB_SUCCESS; +} + +int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_rel = rel; + return MDB_SUCCESS; +} + +int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_relctx = ctx; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_maxkeysize(MDB_env *env) +{ + return ENV_MAXKEY(env); +} + +int ESECT +mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) +{ + unsigned int i, rdrs; + MDB_reader *mr; + char buf[64]; + int rc = 0, first = 1; + + if (!env || !func) + return -1; + if (!env->me_txns) { + return func("(no reader locks)\n", ctx); + } + rdrs = env->me_txns->mti_numreaders; + mr = env->me_txns->mti_readers; + for (i=0; i> 1; + cursor = base + pivot + 1; + val = pid - ids[cursor]; + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + /* found, so it's a duplicate */ + return -1; + } + } + + if( val > 0 ) { + ++cursor; + } + ids[0]++; + for (n = ids[0]; n > cursor; n--) + ids[n] = ids[n-1]; + ids[n] = pid; + return 0; +} + +int ESECT +mdb_reader_check(MDB_env *env, int *dead) +{ + if (!env) + return EINVAL; + if (dead) + *dead = 0; + return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; +} + +/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ +static int ESECT +mdb_reader_check0(MDB_env *env, int rlocked, int *dead) +{ + mdb_mutexref_t rmutex = rlocked ? 
NULL : env->me_rmutex; + unsigned int i, j, rdrs; + MDB_reader *mr; + MDB_PID_T *pids, pid; + int rc = MDB_SUCCESS, count = 0; + + rdrs = env->me_txns->mti_numreaders; + pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); + if (!pids) + return ENOMEM; + pids[0] = 0; + mr = env->me_txns->mti_readers; + for (i=0; ime_pid) { + if (mdb_pid_insert(pids, pid) == 0) { + if (!mdb_reader_pid(env, Pidcheck, pid)) { + /* Stale reader found */ + j = i; + if (rmutex) { + if ((rc = LOCK_MUTEX0(rmutex)) != 0) { + if ((rc = mdb_mutex_failed(env, rmutex, rc))) + break; + rdrs = 0; /* the above checked all readers */ + } else { + /* Recheck, a new process may have reused pid */ + if (mdb_reader_pid(env, Pidcheck, pid)) + j = rdrs; + } + } + for (; jme_rmutex); + if (!rlocked) { + /* Keep mti_txnid updated, otherwise next writer can + * overwrite data which latest meta page refers to. + */ + meta = mdb_env_pick_meta(env); + env->me_txns->mti_txnid = meta->mm_txnid; + /* env is hosed if the dead thread was ours */ + if (env->me_txn) { + env->me_flags |= MDB_FATAL_ERROR; + env->me_txn = NULL; + rc = MDB_PANIC; + } + } + DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? 
"this process' env is hosed" : "recovering"))); + rc2 = mdb_reader_check0(env, rlocked, NULL); + if (rc2 == 0) + rc2 = mdb_mutex_consistent(mutex); + if (rc || (rc = rc2)) { + DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc))); + UNLOCK_MUTEX(mutex); + } + } else { +#ifdef _WIN32 + rc = ErrCode(); +#endif + DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc))); + } + + return rc; +} +#endif /* MDB_ROBUST_SUPPORTED */ + +#if defined(_WIN32) +/** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */ +static int ESECT +utf8_to_utf16(const char *src, MDB_name *dst, int xtra) +{ + int rc, need = 0; + wchar_t *result = NULL; + for (;;) { /* malloc result, then fill it in */ + need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need); + if (!need) { + rc = ErrCode(); + free(result); + return rc; + } + if (!result) { + result = malloc(sizeof(wchar_t) * (need + xtra)); + if (!result) + return ENOMEM; + continue; + } + dst->mn_alloced = 1; + dst->mn_len = need - 1; + dst->mn_val = result; + return MDB_SUCCESS; + } +} +#endif /* defined(_WIN32) */ +/** @} */ diff --git a/c/third_party/lmdb/libraries/liblmdb/midl.c b/c/third_party/lmdb/libraries/liblmdb/midl.c new file mode 100644 index 0000000..b0ea538 --- /dev/null +++ b/c/third_party/lmdb/libraries/liblmdb/midl.c @@ -0,0 +1,359 @@ +/** @file midl.c + * @brief ldap bdb back-end ID List functions */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 2000-2021 The OpenLDAP Foundation. + * Portions Copyright 2001-2021 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include +#include +#include +#include +#include +#include "midl.h" + +/** @defgroup internal LMDB Internals + * @{ + */ +/** @defgroup idls ID List Management + * @{ + */ +#define CMP(x,y) ( (x) < (y) ? -1 : (x) > (y) ) + +unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( ids[cursor], id ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +#if 0 /* superseded by append/sort */ +int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) +{ + unsigned x, i; + + x = mdb_midl_search( ids, id ); + assert( x > 0 ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0] && ids[x] == id ) { + /* duplicate */ + assert(0); + return -1; + } + + if ( ++ids[0] >= MDB_IDL_DB_MAX ) { + /* no room */ + --ids[0]; + return -2; + + } else { + /* insert id */ + for (i=ids[0]; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = id; + } + + return 0; +} +#endif + +MDB_IDL mdb_midl_alloc(int num) +{ + MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID)); + if (ids) { + *ids++ = num; + *ids = 0; + } + return ids; +} + +void mdb_midl_free(MDB_IDL ids) +{ + if (ids) + free(ids-1); +} + +void mdb_midl_shrink( MDB_IDL *idp ) +{ + MDB_IDL ids = *idp; + if (*(--ids) > MDB_IDL_UM_MAX && + (ids = realloc(ids, (MDB_IDL_UM_MAX+2) * sizeof(MDB_ID)))) + { + *ids++ = MDB_IDL_UM_MAX; + *idp = ids; + } +} + +static int mdb_midl_grow( MDB_IDL *idp, int num ) +{ + MDB_IDL idn = *idp-1; + /* grow it */ + idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); + if (!idn) + return ENOMEM; + *idn++ += num; + *idp = idn; + return 0; +} + +int mdb_midl_need( 
MDB_IDL *idp, unsigned num ) +{ + MDB_IDL ids = *idp; + num += ids[0]; + if (num > ids[-1]) { + num = (num + num/4 + (256 + 2)) & -256; + if (!(ids = realloc(ids-1, num * sizeof(MDB_ID)))) + return ENOMEM; + *ids++ = num - 2; + *idp = ids; + } + return 0; +} + +int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) +{ + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] >= ids[-1]) { + if (mdb_midl_grow(idp, MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0]++; + ids[ids[0]] = id; + return 0; +} + +int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ) +{ + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] + app[0] >= ids[-1]) { + if (mdb_midl_grow(idp, app[0])) + return ENOMEM; + ids = *idp; + } + memcpy(&ids[ids[0]+1], &app[1], app[0] * sizeof(MDB_ID)); + ids[0] += app[0]; + return 0; +} + +int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ) +{ + MDB_ID *ids = *idp, len = ids[0]; + /* Too big? */ + if (len + n > ids[-1]) { + if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0] = len + n; + ids += len; + while (n) + ids[n--] = id++; + return 0; +} + +void mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge ) +{ + MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i+j, total = k; + idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ + old_id = idl[j]; + while (i) { + merge_id = merge[i--]; + for (; old_id < merge_id; old_id = idl[--j]) + idl[k--] = old_id; + idl[k--] = merge_id; + } + idl[0] = total; +} + +/* Quicksort + Insertion sort for small arrays */ + +#define SMALL 8 +#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } + +void +mdb_midl_sort( MDB_IDL ids ) +{ + /* Max possible depth of int-indexed tree * 2 items/level */ + int istack[sizeof(int)*CHAR_BIT * 2]; + int i,j,k,l,ir,jstack; + MDB_ID a, itmp; + + ir = (int)ids[0]; + l = 1; + jstack = 0; + for(;;) { + if (ir - l < SMALL) { /* Insertion sort */ + for (j=l+1;j<=ir;j++) { + a = ids[j]; + for (i=j-1;i>=1;i--) { + if (ids[i] >= a) break; + ids[i+1] = 
ids[i]; + } + ids[i+1] = a; + } + if (jstack == 0) break; + ir = istack[jstack--]; + l = istack[jstack--]; + } else { + k = (l + ir) >> 1; /* Choose median of left, center, right */ + MIDL_SWAP(ids[k], ids[l+1]); + if (ids[l] < ids[ir]) { + MIDL_SWAP(ids[l], ids[ir]); + } + if (ids[l+1] < ids[ir]) { + MIDL_SWAP(ids[l+1], ids[ir]); + } + if (ids[l] < ids[l+1]) { + MIDL_SWAP(ids[l], ids[l+1]); + } + i = l+1; + j = ir; + a = ids[l+1]; + for(;;) { + do i++; while(ids[i] > a); + do j--; while(ids[j] < a); + if (j < i) break; + MIDL_SWAP(ids[i],ids[j]); + } + ids[l+1] = ids[j]; + ids[j] = a; + jstack += 2; + if (ir-i+1 >= j-l) { + istack[jstack] = ir; + istack[jstack-1] = i; + ir = j-1; + } else { + istack[jstack] = j-1; + istack[jstack-1] = l; + l = i; + } + } + } +} + +unsigned mdb_mid2l_search( MDB_ID2L ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( id, ids[cursor].mid ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ) +{ + unsigned x, i; + + x = mdb_mid2l_search( ids, id->mid ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0].mid && ids[x].mid == id->mid ) { + /* duplicate */ + return -1; + } + + if ( ids[0].mid >= MDB_IDL_UM_MAX ) { + /* too big */ + return -2; + + } else { + /* insert id */ + ids[0].mid++; + for (i=(unsigned)ids[0].mid; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = *id; + } + + return 0; +} + +int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) +{ + /* Too big? 
*/ + if (ids[0].mid >= MDB_IDL_UM_MAX) { + return -2; + } + ids[0].mid++; + ids[ids[0].mid] = *id; + return 0; +} + +/** @} */ +/** @} */ diff --git a/c/third_party/lmdb/libraries/liblmdb/midl.h b/c/third_party/lmdb/libraries/liblmdb/midl.h new file mode 100644 index 0000000..dd6ae77 --- /dev/null +++ b/c/third_party/lmdb/libraries/liblmdb/midl.h @@ -0,0 +1,186 @@ +/** @file midl.h + * @brief LMDB ID List header file. + * + * This file was originally part of back-bdb but has been + * modified for use in libmdb. Most of the macros defined + * in this file are unused, just left over from the original. + * + * This file is only used internally in libmdb and its definitions + * are not exposed publicly. + */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 2000-2021 The OpenLDAP Foundation. + * Portions Copyright 2001-2021 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#ifndef _MDB_MIDL_H_ +#define _MDB_MIDL_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @defgroup internal LMDB Internals + * @{ + */ + +/** @defgroup idls ID List Management + * @{ + */ + /** A generic unsigned ID number. These were entryIDs in back-bdb. + * Preferably it should have the same size as a pointer. + */ +typedef size_t MDB_ID; + + /** An IDL is an ID List, a sorted array of IDs. The first + * element of the array is a counter for how many actual + * IDs are in the list. In the original back-bdb code, IDLs are + * sorted in ascending order. For libmdb IDLs are sorted in + * descending order. 
+ */ +typedef MDB_ID *MDB_IDL; + +/* IDL sizes - likely should be even bigger + * limiting factors: sizeof(ID), thread stack size + */ +#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDB_IDL_DB_SIZE (1< +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @defgroup msgpack_fbuffer FILE* buffer + * @ingroup msgpack_buffer + * @{ + */ + +static inline int msgpack_fbuffer_write(void* data, const char* buf, size_t len) +{ + assert(buf || len == 0); + if(!buf) return 0; + + return (1 == fwrite(buf, len, 1, (FILE *)data)) ? 0 : -1; +} + +/** @} */ + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/fbuffer.h */ diff --git a/c/third_party/msgpack/include/msgpack/gcc_atomic.h b/c/third_party/msgpack/include/msgpack/gcc_atomic.h new file mode 100644 index 0000000..6b1b1a7 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/gcc_atomic.h @@ -0,0 +1,25 @@ +/* + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ + +#ifndef MSGPACK_GCC_ATOMIC_H +#define MSGPACK_GCC_ATOMIC_H + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef int _msgpack_atomic_counter_t; + +int _msgpack_sync_decr_and_fetch(volatile _msgpack_atomic_counter_t* ptr); +int _msgpack_sync_incr_and_fetch(volatile _msgpack_atomic_counter_t* ptr); + + +#if defined(__cplusplus) +} +#endif + + +#endif // MSGPACK_GCC_ATOMIC_H diff --git a/c/third_party/msgpack/include/msgpack/object.h b/c/third_party/msgpack/include/msgpack/object.h new file mode 100644 index 0000000..b92b985 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/object.h @@ -0,0 +1,151 @@ +/* + * MessagePack for C dynamic typing routine + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_OBJECT_H +#define MSGPACK_OBJECT_H + +#include "zone.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @defgroup msgpack_object Dynamically typed object + * @ingroup msgpack + * @{ + */ + +typedef enum { + MSGPACK_OBJECT_NIL = 0x00, + MSGPACK_OBJECT_BOOLEAN = 0x01, + MSGPACK_OBJECT_POSITIVE_INTEGER = 0x02, + MSGPACK_OBJECT_NEGATIVE_INTEGER = 0x03, + MSGPACK_OBJECT_FLOAT32 = 0x0a, + MSGPACK_OBJECT_FLOAT64 = 0x04, + MSGPACK_OBJECT_FLOAT = 0x04, +#if defined(MSGPACK_USE_LEGACY_NAME_AS_FLOAT) + MSGPACK_OBJECT_DOUBLE = MSGPACK_OBJECT_FLOAT, /* obsolete */ +#endif /* MSGPACK_USE_LEGACY_NAME_AS_FLOAT */ + MSGPACK_OBJECT_STR = 0x05, + MSGPACK_OBJECT_ARRAY = 0x06, + MSGPACK_OBJECT_MAP = 0x07, + MSGPACK_OBJECT_BIN = 0x08, + MSGPACK_OBJECT_EXT = 0x09 +} msgpack_object_type; + + +struct msgpack_object; +struct msgpack_object_kv; + +typedef struct { + uint32_t size; + struct msgpack_object* ptr; +} msgpack_object_array; + +typedef struct { + uint32_t size; + struct msgpack_object_kv* ptr; +} msgpack_object_map; + +typedef struct { + uint32_t size; + const char* ptr; +} msgpack_object_str; + +typedef struct { + uint32_t size; + const char* ptr; +} msgpack_object_bin; + +typedef struct { + int8_t type; + uint32_t size; + const char* ptr; +} msgpack_object_ext; + +typedef union { + bool boolean; + uint64_t u64; + int64_t i64; +#if defined(MSGPACK_USE_LEGACY_NAME_AS_FLOAT) + double dec; /* obsolete*/ +#endif /* MSGPACK_USE_LEGACY_NAME_AS_FLOAT */ + double f64; + msgpack_object_array array; + msgpack_object_map map; + msgpack_object_str str; + msgpack_object_bin bin; + msgpack_object_ext ext; +} msgpack_object_union; + +typedef struct msgpack_object { + msgpack_object_type type; + msgpack_object_union via; +} msgpack_object; + +typedef struct msgpack_object_kv { + msgpack_object key; + msgpack_object val; +} msgpack_object_kv; + +MSGPACK_DLLEXPORT 
+void msgpack_object_init_nil(msgpack_object* d); + +MSGPACK_DLLEXPORT +void msgpack_object_init_boolean(msgpack_object* d, bool v); + +MSGPACK_DLLEXPORT +void msgpack_object_init_unsigned_integer(msgpack_object* d, uint64_t v); + +MSGPACK_DLLEXPORT +void msgpack_object_init_signed_integer(msgpack_object* d, int64_t v); + +MSGPACK_DLLEXPORT +void msgpack_object_init_float32(msgpack_object* d, float v); + +MSGPACK_DLLEXPORT +void msgpack_object_init_float64(msgpack_object* d, double v); + +MSGPACK_DLLEXPORT +void msgpack_object_init_str(msgpack_object* d, const char* data, uint32_t size); + +MSGPACK_DLLEXPORT +void msgpack_object_init_bin(msgpack_object* d, const char* data, uint32_t size); + +MSGPACK_DLLEXPORT +void msgpack_object_init_ext(msgpack_object* d, int8_t type, const char* data, uint32_t size); + +MSGPACK_DLLEXPORT +void msgpack_object_init_array(msgpack_object* d, msgpack_object* data, uint32_t size); + +MSGPACK_DLLEXPORT +void msgpack_object_init_map(msgpack_object* d, msgpack_object_kv* data, uint32_t size); + +#if !defined(_KERNEL_MODE) +MSGPACK_DLLEXPORT +void msgpack_object_print(FILE* out, msgpack_object o); +#endif + +MSGPACK_DLLEXPORT +int msgpack_object_print_buffer(char *buffer, size_t buffer_size, msgpack_object o); + +MSGPACK_DLLEXPORT +bool msgpack_object_equal(const msgpack_object x, const msgpack_object y); + +/** @} */ + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/object.h */ diff --git a/c/third_party/msgpack/include/msgpack/pack.h b/c/third_party/msgpack/include/msgpack/pack.h new file mode 100644 index 0000000..08ab84b --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/pack.h @@ -0,0 +1,174 @@ +/* + * MessagePack for C packing routine + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_PACK_H +#define MSGPACK_PACK_H + +#include "pack_define.h" +#include "object.h" +#include "timestamp.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @defgroup msgpack_buffer Buffers + * @ingroup msgpack + * @{ + * @} + */ + +/** + * @defgroup msgpack_pack Serializer + * @ingroup msgpack + * @{ + */ + +typedef int (*msgpack_packer_write)(void* data, const char* buf, size_t len); + +typedef struct msgpack_packer { + void* data; + msgpack_packer_write callback; +} msgpack_packer; + +static void msgpack_packer_init(msgpack_packer* pk, void* data, msgpack_packer_write callback); + +static msgpack_packer* msgpack_packer_new(void* data, msgpack_packer_write callback); +static void msgpack_packer_free(msgpack_packer* pk); + +static int msgpack_pack_char(msgpack_packer* pk, char d); + +static int msgpack_pack_signed_char(msgpack_packer* pk, signed char d); +static int msgpack_pack_short(msgpack_packer* pk, short d); +static int msgpack_pack_int(msgpack_packer* pk, int d); +static int msgpack_pack_long(msgpack_packer* pk, long d); +static int msgpack_pack_long_long(msgpack_packer* pk, long long d); +static int msgpack_pack_unsigned_char(msgpack_packer* pk, unsigned char d); +static int msgpack_pack_unsigned_short(msgpack_packer* pk, unsigned short d); +static int msgpack_pack_unsigned_int(msgpack_packer* pk, unsigned int d); +static int msgpack_pack_unsigned_long(msgpack_packer* pk, unsigned long d); +static int msgpack_pack_unsigned_long_long(msgpack_packer* pk, unsigned long long d); + +static int msgpack_pack_uint8(msgpack_packer* pk, uint8_t d); +static int msgpack_pack_uint16(msgpack_packer* pk, uint16_t d); +static int msgpack_pack_uint32(msgpack_packer* pk, uint32_t d); +static int msgpack_pack_uint64(msgpack_packer* pk, uint64_t d); +static int msgpack_pack_int8(msgpack_packer* pk, int8_t d); +static int 
msgpack_pack_int16(msgpack_packer* pk, int16_t d); +static int msgpack_pack_int32(msgpack_packer* pk, int32_t d); +static int msgpack_pack_int64(msgpack_packer* pk, int64_t d); + +static int msgpack_pack_fix_uint8(msgpack_packer* pk, uint8_t d); +static int msgpack_pack_fix_uint16(msgpack_packer* pk, uint16_t d); +static int msgpack_pack_fix_uint32(msgpack_packer* pk, uint32_t d); +static int msgpack_pack_fix_uint64(msgpack_packer* pk, uint64_t d); +static int msgpack_pack_fix_int8(msgpack_packer* pk, int8_t d); +static int msgpack_pack_fix_int16(msgpack_packer* pk, int16_t d); +static int msgpack_pack_fix_int32(msgpack_packer* pk, int32_t d); +static int msgpack_pack_fix_int64(msgpack_packer* pk, int64_t d); + +static int msgpack_pack_float(msgpack_packer* pk, float d); +static int msgpack_pack_double(msgpack_packer* pk, double d); + +static int msgpack_pack_nil(msgpack_packer* pk); +static int msgpack_pack_true(msgpack_packer* pk); +static int msgpack_pack_false(msgpack_packer* pk); + +static int msgpack_pack_array(msgpack_packer* pk, size_t n); + +static int msgpack_pack_map(msgpack_packer* pk, size_t n); + +static int msgpack_pack_str(msgpack_packer* pk, size_t l); +static int msgpack_pack_str_body(msgpack_packer* pk, const void* b, size_t l); +static int msgpack_pack_str_with_body(msgpack_packer* pk, const void* b, size_t l); + +static int msgpack_pack_v4raw(msgpack_packer* pk, size_t l); +static int msgpack_pack_v4raw_body(msgpack_packer* pk, const void* b, size_t l); + +static int msgpack_pack_bin(msgpack_packer* pk, size_t l); +static int msgpack_pack_bin_body(msgpack_packer* pk, const void* b, size_t l); +static int msgpack_pack_bin_with_body(msgpack_packer* pk, const void* b, size_t l); + +static int msgpack_pack_ext(msgpack_packer* pk, size_t l, int8_t type); +static int msgpack_pack_ext_body(msgpack_packer* pk, const void* b, size_t l); +static int msgpack_pack_ext_with_body(msgpack_packer* pk, const void* b, size_t l, int8_t type); + +static int 
msgpack_pack_timestamp(msgpack_packer* pk, const msgpack_timestamp* d); + +MSGPACK_DLLEXPORT +int msgpack_pack_object(msgpack_packer* pk, msgpack_object d); + + +/** @} */ + + +#define msgpack_pack_inline_func(name) \ + inline int msgpack_pack ## name + +#define msgpack_pack_inline_func_cint(name) \ + inline int msgpack_pack ## name + +#define msgpack_pack_inline_func_fixint(name) \ + inline int msgpack_pack_fix ## name + +#define msgpack_pack_user msgpack_packer* + +#define msgpack_pack_append_buffer(user, buf, len) \ + return (*(user)->callback)((user)->data, (const char*)buf, len) + +#include "pack_template.h" + +inline void msgpack_packer_init(msgpack_packer* pk, void* data, msgpack_packer_write callback) +{ + pk->data = data; + pk->callback = callback; +} + +inline msgpack_packer* msgpack_packer_new(void* data, msgpack_packer_write callback) +{ + msgpack_packer* pk = (msgpack_packer*)calloc(1, sizeof(msgpack_packer)); + if(!pk) { return NULL; } + msgpack_packer_init(pk, data, callback); + return pk; +} + +inline void msgpack_packer_free(msgpack_packer* pk) +{ + free(pk); +} + +inline int msgpack_pack_str_with_body(msgpack_packer* pk, const void* b, size_t l) + { + int ret = msgpack_pack_str(pk, l); + if (ret != 0) { return ret; } + return msgpack_pack_str_body(pk, b, l); + } + + inline int msgpack_pack_bin_with_body(msgpack_packer* pk, const void* b, size_t l) + { + int ret = msgpack_pack_bin(pk, l); + if (ret != 0) { return ret; } + return msgpack_pack_bin_body(pk, b, l); + } + + inline int msgpack_pack_ext_with_body(msgpack_packer* pk, const void* b, size_t l, int8_t type) + { + int ret = msgpack_pack_ext(pk, l, type); + if (ret != 0) { return ret; } + return msgpack_pack_ext_body(pk, b, l); + } + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/pack.h */ diff --git a/c/third_party/msgpack/include/msgpack/pack_define.h b/c/third_party/msgpack/include/msgpack/pack_define.h new file mode 100644 index 0000000..ce98b67 --- /dev/null +++ 
b/c/third_party/msgpack/include/msgpack/pack_define.h @@ -0,0 +1,18 @@ +/* + * MessagePack unpacking routine template + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_PACK_DEFINE_H +#define MSGPACK_PACK_DEFINE_H + +#include "msgpack/sysdep.h" +#include +#include + +#endif /* msgpack/pack_define.h */ + diff --git a/c/third_party/msgpack/include/msgpack/pack_template.h b/c/third_party/msgpack/include/msgpack/pack_template.h new file mode 100644 index 0000000..8a4efd9 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/pack_template.h @@ -0,0 +1,943 @@ +/* + * MessagePack packing routine template + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ + +#ifndef MSGPACK_ENDIAN_BIG_BYTE +#define MSGPACK_ENDIAN_BIG_BYTE 0 +#endif +#ifndef MSGPACK_ENDIAN_LITTLE_BYTE +#define MSGPACK_ENDIAN_LITTLE_BYTE 1 +#endif + +#if MSGPACK_ENDIAN_LITTLE_BYTE +#define TAKE8_8(d) ((uint8_t*)&d)[0] +#define TAKE8_16(d) ((uint8_t*)&d)[0] +#define TAKE8_32(d) ((uint8_t*)&d)[0] +#define TAKE8_64(d) ((uint8_t*)&d)[0] +#elif MSGPACK_ENDIAN_BIG_BYTE +#define TAKE8_8(d) ((uint8_t*)&d)[0] +#define TAKE8_16(d) ((uint8_t*)&d)[1] +#define TAKE8_32(d) ((uint8_t*)&d)[3] +#define TAKE8_64(d) ((uint8_t*)&d)[7] +#else +#error msgpack-c supports only big endian and little endian +#endif + +#ifndef msgpack_pack_inline_func +#error msgpack_pack_inline_func template is not defined +#endif + +#ifndef msgpack_pack_user +#error msgpack_pack_user type is not defined +#endif + +#ifndef msgpack_pack_append_buffer +#error msgpack_pack_append_buffer callback is not defined +#endif + +#if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4204) /* 
nonstandard extension used: non-constant aggregate initializer */ +#endif + +/* + * Integer + */ + +#define msgpack_pack_real_uint8(x, d) \ +do { \ + if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); \ + } else { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_8(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ +} while(0) + +#define msgpack_pack_real_uint16(x, d) \ +do { \ + if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_16(d), 1); \ + } else if(d < (1<<8)) { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_16(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } \ +} while(0) + +#define msgpack_pack_real_uint32(x, d) \ +do { \ + if(d < (1<<8)) { \ + if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_32(d), 1); \ + } else { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_32(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } else { \ + if(d < (1<<16)) { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* unsigned 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } \ + } \ +} while(0) + +#define msgpack_pack_real_uint64(x, d) \ +do { \ + if(d < (1ULL<<8)) { \ + if(d < (1ULL<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_64(d), 1); \ + } else { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_64(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } else { \ + if(d < (1ULL<<16)) { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } 
else if(d < (1ULL<<32)) { \ + /* unsigned 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } else { \ + /* unsigned 64 */ \ + unsigned char buf[9]; \ + buf[0] = 0xcf; _msgpack_store64(&buf[1], d); \ + msgpack_pack_append_buffer(x, buf, 9); \ + } \ + } \ +} while(0) + +#define msgpack_pack_real_int8(x, d) \ +do { \ + if(d < -(1<<5)) { \ + /* signed 8 */ \ + unsigned char buf[2] = {0xd0, TAKE8_8(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); \ + } \ +} while(0) + +#define msgpack_pack_real_int16(x, d) \ +do { \ + if(d < -(1<<5)) { \ + if(d < -(1<<7)) { \ + /* signed 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* signed 8 */ \ + unsigned char buf[2] = {0xd0, TAKE8_16(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } else if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_16(d), 1); \ + } else { \ + if(d < (1<<8)) { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_16(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } \ + } \ +} while(0) + +#define msgpack_pack_real_int32(x, d) \ +do { \ + if(d < -(1<<5)) { \ + if(d < -(1<<15)) { \ + /* signed 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xd2; _msgpack_store32(&buf[1], (int32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } else if(d < -(1<<7)) { \ + /* signed 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* signed 8 */ \ + unsigned char buf[2] = {0xd0, TAKE8_32(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } else if(d < 
(1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_32(d), 1); \ + } else { \ + if(d < (1<<8)) { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_32(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else if(d < (1<<16)) { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* unsigned 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } \ + } \ +} while(0) + +#define msgpack_pack_real_int64(x, d) \ +do { \ + if(d < -(1LL<<5)) { \ + if(d < -(1LL<<15)) { \ + if(d < -(1LL<<31)) { \ + /* signed 64 */ \ + unsigned char buf[9]; \ + buf[0] = 0xd3; _msgpack_store64(&buf[1], d); \ + msgpack_pack_append_buffer(x, buf, 9); \ + } else { \ + /* signed 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xd2; _msgpack_store32(&buf[1], (int32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } \ + } else { \ + if(d < -(1<<7)) { \ + /* signed 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* signed 8 */ \ + unsigned char buf[2] = {0xd0, TAKE8_64(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } \ + } else if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_64(d), 1); \ + } else { \ + if(d < (1LL<<16)) { \ + if(d < (1<<8)) { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_64(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } \ + } else { \ + if(d < (1LL<<32)) { \ + /* unsigned 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } else { \ + /* unsigned 64 */ \ + 
unsigned char buf[9]; \ + buf[0] = 0xcf; _msgpack_store64(&buf[1], d); \ + msgpack_pack_append_buffer(x, buf, 9); \ + } \ + } \ + } \ +} while(0) + +#ifdef msgpack_pack_inline_func_fixint + +msgpack_pack_inline_func_fixint(_uint8)(msgpack_pack_user x, uint8_t d) +{ + unsigned char buf[2] = {0xcc, TAKE8_8(d)}; + msgpack_pack_append_buffer(x, buf, 2); +} + +msgpack_pack_inline_func_fixint(_uint16)(msgpack_pack_user x, uint16_t d) +{ + unsigned char buf[3]; + buf[0] = 0xcd; _msgpack_store16(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 3); +} + +msgpack_pack_inline_func_fixint(_uint32)(msgpack_pack_user x, uint32_t d) +{ + unsigned char buf[5]; + buf[0] = 0xce; _msgpack_store32(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 5); +} + +msgpack_pack_inline_func_fixint(_uint64)(msgpack_pack_user x, uint64_t d) +{ + unsigned char buf[9]; + buf[0] = 0xcf; _msgpack_store64(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 9); +} + +msgpack_pack_inline_func_fixint(_int8)(msgpack_pack_user x, int8_t d) +{ + unsigned char buf[2] = {0xd0, TAKE8_8(d)}; + msgpack_pack_append_buffer(x, buf, 2); +} + +msgpack_pack_inline_func_fixint(_int16)(msgpack_pack_user x, int16_t d) +{ + unsigned char buf[3]; + buf[0] = 0xd1; _msgpack_store16(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 3); +} + +msgpack_pack_inline_func_fixint(_int32)(msgpack_pack_user x, int32_t d) +{ + unsigned char buf[5]; + buf[0] = 0xd2; _msgpack_store32(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 5); +} + +msgpack_pack_inline_func_fixint(_int64)(msgpack_pack_user x, int64_t d) +{ + unsigned char buf[9]; + buf[0] = 0xd3; _msgpack_store64(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 9); +} + +#undef msgpack_pack_inline_func_fixint +#endif + +msgpack_pack_inline_func(_uint8)(msgpack_pack_user x, uint8_t d) +{ + msgpack_pack_real_uint8(x, d); +} + +msgpack_pack_inline_func(_uint16)(msgpack_pack_user x, uint16_t d) +{ + msgpack_pack_real_uint16(x, d); +} + 
+msgpack_pack_inline_func(_uint32)(msgpack_pack_user x, uint32_t d) +{ + msgpack_pack_real_uint32(x, d); +} + +msgpack_pack_inline_func(_uint64)(msgpack_pack_user x, uint64_t d) +{ + msgpack_pack_real_uint64(x, d); +} + +msgpack_pack_inline_func(_int8)(msgpack_pack_user x, int8_t d) +{ + msgpack_pack_real_int8(x, d); +} + +msgpack_pack_inline_func(_int16)(msgpack_pack_user x, int16_t d) +{ + msgpack_pack_real_int16(x, d); +} + +msgpack_pack_inline_func(_int32)(msgpack_pack_user x, int32_t d) +{ + msgpack_pack_real_int32(x, d); +} + +msgpack_pack_inline_func(_int64)(msgpack_pack_user x, int64_t d) +{ + msgpack_pack_real_int64(x, d); +} + +msgpack_pack_inline_func(_char)(msgpack_pack_user x, char d) +{ +#if defined(CHAR_MIN) +#if CHAR_MIN < 0 + msgpack_pack_real_int8(x, d); +#else + msgpack_pack_real_uint8(x, d); +#endif +#else +#error CHAR_MIN is not defined +#endif +} + +msgpack_pack_inline_func(_signed_char)(msgpack_pack_user x, signed char d) +{ + msgpack_pack_real_int8(x, d); +} + +msgpack_pack_inline_func(_unsigned_char)(msgpack_pack_user x, unsigned char d) +{ + msgpack_pack_real_uint8(x, d); +} + +#ifdef msgpack_pack_inline_func_cint + +msgpack_pack_inline_func_cint(_short)(msgpack_pack_user x, short d) +{ +#if defined(SIZEOF_SHORT) +#if SIZEOF_SHORT == 2 + msgpack_pack_real_int16(x, d); +#elif SIZEOF_SHORT == 4 + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#elif defined(SHRT_MAX) +#if SHRT_MAX == 0x7fff + msgpack_pack_real_int16(x, d); +#elif SHRT_MAX == 0x7fffffff + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#else +if(sizeof(short) == 2) { + msgpack_pack_real_int16(x, d); +} else if(sizeof(short) == 4) { + msgpack_pack_real_int32(x, d); +} else { + msgpack_pack_real_int64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_int)(msgpack_pack_user x, int d) +{ +#if defined(SIZEOF_INT) +#if SIZEOF_INT == 2 + msgpack_pack_real_int16(x, d); +#elif SIZEOF_INT == 4 + 
msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#elif defined(INT_MAX) +#if INT_MAX == 0x7fff + msgpack_pack_real_int16(x, d); +#elif INT_MAX == 0x7fffffff + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#else +if(sizeof(int) == 2) { + msgpack_pack_real_int16(x, d); +} else if(sizeof(int) == 4) { + msgpack_pack_real_int32(x, d); +} else { + msgpack_pack_real_int64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_long)(msgpack_pack_user x, long d) +{ +#if defined(SIZEOF_LONG) +#if SIZEOF_LONG == 2 + msgpack_pack_real_int16(x, d); +#elif SIZEOF_LONG == 4 + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#elif defined(LONG_MAX) +#if LONG_MAX == 0x7fffL + msgpack_pack_real_int16(x, d); +#elif LONG_MAX == 0x7fffffffL + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#else +if(sizeof(long) == 2) { + msgpack_pack_real_int16(x, d); +} else if(sizeof(long) == 4) { + msgpack_pack_real_int32(x, d); +} else { + msgpack_pack_real_int64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_long_long)(msgpack_pack_user x, long long d) +{ +#if defined(SIZEOF_LONG_LONG) +#if SIZEOF_LONG_LONG == 2 + msgpack_pack_real_int16(x, d); +#elif SIZEOF_LONG_LONG == 4 + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#elif defined(LLONG_MAX) +#if LLONG_MAX == 0x7fffL + msgpack_pack_real_int16(x, d); +#elif LLONG_MAX == 0x7fffffff + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#else +if(sizeof(long long) == 2) { + msgpack_pack_real_int16(x, d); +} else if(sizeof(long long) == 4) { + msgpack_pack_real_int32(x, d); +} else { + msgpack_pack_real_int64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_unsigned_short)(msgpack_pack_user x, unsigned short d) +{ +#if defined(SIZEOF_SHORT) +#if SIZEOF_SHORT == 2 + msgpack_pack_real_uint16(x, d); +#elif SIZEOF_SHORT == 4 + 
msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#elif defined(USHRT_MAX) +#if USHRT_MAX == 0xffffU + msgpack_pack_real_uint16(x, d); +#elif USHRT_MAX == 0xffffffffU + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#else +if(sizeof(unsigned short) == 2) { + msgpack_pack_real_uint16(x, d); +} else if(sizeof(unsigned short) == 4) { + msgpack_pack_real_uint32(x, d); +} else { + msgpack_pack_real_uint64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_unsigned_int)(msgpack_pack_user x, unsigned int d) +{ +#if defined(SIZEOF_INT) +#if SIZEOF_INT == 2 + msgpack_pack_real_uint16(x, d); +#elif SIZEOF_INT == 4 + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#elif defined(UINT_MAX) +#if UINT_MAX == 0xffffU + msgpack_pack_real_uint16(x, d); +#elif UINT_MAX == 0xffffffffU + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#else +if(sizeof(unsigned int) == 2) { + msgpack_pack_real_uint16(x, d); +} else if(sizeof(unsigned int) == 4) { + msgpack_pack_real_uint32(x, d); +} else { + msgpack_pack_real_uint64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_unsigned_long)(msgpack_pack_user x, unsigned long d) +{ +#if defined(SIZEOF_LONG) +#if SIZEOF_LONG == 2 + msgpack_pack_real_uint16(x, d); +#elif SIZEOF_LONG == 4 + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#elif defined(ULONG_MAX) +#if ULONG_MAX == 0xffffUL + msgpack_pack_real_uint16(x, d); +#elif ULONG_MAX == 0xffffffffUL + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#else +if(sizeof(unsigned long) == 2) { + msgpack_pack_real_uint16(x, d); +} else if(sizeof(unsigned long) == 4) { + msgpack_pack_real_uint32(x, d); +} else { + msgpack_pack_real_uint64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_unsigned_long_long)(msgpack_pack_user x, unsigned long long d) +{ +#if 
defined(SIZEOF_LONG_LONG) +#if SIZEOF_LONG_LONG == 2 + msgpack_pack_real_uint16(x, d); +#elif SIZEOF_LONG_LONG == 4 + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#elif defined(ULLONG_MAX) +#if ULLONG_MAX == 0xffffUL + msgpack_pack_real_uint16(x, d); +#elif ULLONG_MAX == 0xffffffffUL + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#else +if(sizeof(unsigned long long) == 2) { + msgpack_pack_real_uint16(x, d); +} else if(sizeof(unsigned long long) == 4) { + msgpack_pack_real_uint32(x, d); +} else { + msgpack_pack_real_uint64(x, d); +} +#endif +} + +#undef msgpack_pack_inline_func_cint +#endif + +/* + * Float + */ + +msgpack_pack_inline_func(_float)(msgpack_pack_user x, float d) +{ + unsigned char buf[5]; + union { float f; uint32_t i; } mem; + mem.f = d; + buf[0] = 0xca; _msgpack_store32(&buf[1], mem.i); + msgpack_pack_append_buffer(x, buf, 5); +} + +msgpack_pack_inline_func(_double)(msgpack_pack_user x, double d) +{ + unsigned char buf[9]; + union { double f; uint64_t i; } mem; + mem.f = d; + buf[0] = 0xcb; +#if defined(TARGET_OS_IPHONE) + /* ok */ +#elif defined(__arm__) && !(__ARM_EABI__) /* arm-oabi */ + /* https://github.com/msgpack/msgpack-perl/pull/1 */ + mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL); +#endif + _msgpack_store64(&buf[1], mem.i); + msgpack_pack_append_buffer(x, buf, 9); +} + +/* + * Nil + */ + +msgpack_pack_inline_func(_nil)(msgpack_pack_user x) +{ + static const unsigned char d = 0xc0; + msgpack_pack_append_buffer(x, &d, 1); +} + +/* + * Boolean + */ + +msgpack_pack_inline_func(_true)(msgpack_pack_user x) +{ + static const unsigned char d = 0xc3; + msgpack_pack_append_buffer(x, &d, 1); +} + +msgpack_pack_inline_func(_false)(msgpack_pack_user x) +{ + static const unsigned char d = 0xc2; + msgpack_pack_append_buffer(x, &d, 1); +} + +/* + * Array + */ + +msgpack_pack_inline_func(_array)(msgpack_pack_user x, size_t n) +{ + if(n < 16) { + unsigned char d = 0x90 | 
(uint8_t)n; + msgpack_pack_append_buffer(x, &d, 1); + } else if(n < 65536) { + unsigned char buf[3]; + buf[0] = 0xdc; _msgpack_store16(&buf[1], (uint16_t)n); + msgpack_pack_append_buffer(x, buf, 3); + } else { + unsigned char buf[5]; + buf[0] = 0xdd; _msgpack_store32(&buf[1], (uint32_t)n); + msgpack_pack_append_buffer(x, buf, 5); + } +} + +/* + * Map + */ + +msgpack_pack_inline_func(_map)(msgpack_pack_user x, size_t n) +{ + if(n < 16) { + unsigned char d = 0x80 | (uint8_t)n; + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); + } else if(n < 65536) { + unsigned char buf[3]; + buf[0] = 0xde; _msgpack_store16(&buf[1], (uint16_t)n); + msgpack_pack_append_buffer(x, buf, 3); + } else { + unsigned char buf[5]; + buf[0] = 0xdf; _msgpack_store32(&buf[1], (uint32_t)n); + msgpack_pack_append_buffer(x, buf, 5); + } +} + +/* + * Str + */ + +msgpack_pack_inline_func(_str)(msgpack_pack_user x, size_t l) +{ + if(l < 32) { + unsigned char d = 0xa0 | (uint8_t)l; + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); + } else if(l < 256) { + unsigned char buf[2]; + buf[0] = 0xd9; buf[1] = (uint8_t)l; + msgpack_pack_append_buffer(x, buf, 2); + } else if(l < 65536) { + unsigned char buf[3]; + buf[0] = 0xda; _msgpack_store16(&buf[1], (uint16_t)l); + msgpack_pack_append_buffer(x, buf, 3); + } else { + unsigned char buf[5]; + buf[0] = 0xdb; _msgpack_store32(&buf[1], (uint32_t)l); + msgpack_pack_append_buffer(x, buf, 5); + } +} + +msgpack_pack_inline_func(_str_body)(msgpack_pack_user x, const void* b, size_t l) +{ + msgpack_pack_append_buffer(x, (const unsigned char*)b, l); +} + +/* + * Raw (V4) + */ + +msgpack_pack_inline_func(_v4raw)(msgpack_pack_user x, size_t l) +{ + if(l < 32) { + unsigned char d = 0xa0 | (uint8_t)l; + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); + } else if(l < 65536) { + unsigned char buf[3]; + buf[0] = 0xda; _msgpack_store16(&buf[1], (uint16_t)l); + msgpack_pack_append_buffer(x, buf, 3); + } else { + unsigned char buf[5]; + buf[0] = 0xdb; _msgpack_store32(&buf[1], 
(uint32_t)l); + msgpack_pack_append_buffer(x, buf, 5); + } +} + +msgpack_pack_inline_func(_v4raw_body)(msgpack_pack_user x, const void* b, size_t l) +{ + msgpack_pack_append_buffer(x, (const unsigned char*)b, l); +} + +/* + * Bin + */ + +msgpack_pack_inline_func(_bin)(msgpack_pack_user x, size_t l) +{ + if(l < 256) { + unsigned char buf[2]; + buf[0] = 0xc4; buf[1] = (uint8_t)l; + msgpack_pack_append_buffer(x, buf, 2); + } else if(l < 65536) { + unsigned char buf[3]; + buf[0] = 0xc5; _msgpack_store16(&buf[1], (uint16_t)l); + msgpack_pack_append_buffer(x, buf, 3); + } else { + unsigned char buf[5]; + buf[0] = 0xc6; _msgpack_store32(&buf[1], (uint32_t)l); + msgpack_pack_append_buffer(x, buf, 5); + } +} + +msgpack_pack_inline_func(_bin_body)(msgpack_pack_user x, const void* b, size_t l) +{ + msgpack_pack_append_buffer(x, (const unsigned char*)b, l); +} + +/* + * Ext + */ + +msgpack_pack_inline_func(_ext)(msgpack_pack_user x, size_t l, int8_t type) +{ + switch(l) { + case 1: { + unsigned char buf[2]; + buf[0] = 0xd4; + buf[1] = (unsigned char)type; + msgpack_pack_append_buffer(x, buf, 2); + } break; + case 2: { + unsigned char buf[2]; + buf[0] = 0xd5; + buf[1] = (unsigned char)type; + msgpack_pack_append_buffer(x, buf, 2); + } break; + case 4: { + unsigned char buf[2]; + buf[0] = 0xd6; + buf[1] = (unsigned char)type; + msgpack_pack_append_buffer(x, buf, 2); + } break; + case 8: { + unsigned char buf[2]; + buf[0] = 0xd7; + buf[1] = (unsigned char)type; + msgpack_pack_append_buffer(x, buf, 2); + } break; + case 16: { + unsigned char buf[2]; + buf[0] = 0xd8; + buf[1] = (unsigned char)type; + msgpack_pack_append_buffer(x, buf, 2); + } break; + default: + if(l < 256) { + unsigned char buf[3]; + buf[0] = 0xc7; + buf[1] = (unsigned char)l; + buf[2] = (unsigned char)type; + msgpack_pack_append_buffer(x, buf, 3); + } else if(l < 65536) { + unsigned char buf[4]; + buf[0] = 0xc8; + _msgpack_store16(&buf[1], l); + buf[3] = (unsigned char)type; + msgpack_pack_append_buffer(x, buf, 
4); + } else { + unsigned char buf[6]; + buf[0] = 0xc9; + _msgpack_store32(&buf[1], l); + buf[5] = (unsigned char)type; + msgpack_pack_append_buffer(x, buf, 6); + } + break; + } +} + +msgpack_pack_inline_func(_ext_body)(msgpack_pack_user x, const void* b, size_t l) +{ + msgpack_pack_append_buffer(x, (const unsigned char*)b, l); +} + +msgpack_pack_inline_func(_timestamp)(msgpack_pack_user x, const msgpack_timestamp* d) +{ + if ((((int64_t)d->tv_sec) >> 34) == 0) { + uint64_t data64 = ((uint64_t) d->tv_nsec << 34) | (uint64_t)d->tv_sec; + if ((data64 & 0xffffffff00000000L) == 0) { + /* timestamp 32 */ + char buf[4]; + uint32_t data32 = (uint32_t)data64; + msgpack_pack_ext(x, 4, -1); + _msgpack_store32(buf, data32); + msgpack_pack_append_buffer(x, buf, 4); + } else { + /* timestamp 64 */ + char buf[8]; + msgpack_pack_ext(x, 8, -1); + _msgpack_store64(buf, data64); + msgpack_pack_append_buffer(x, buf, 8); + } + } else { + /* timestamp 96 */ + char buf[12]; + _msgpack_store32(&buf[0], d->tv_nsec); + _msgpack_store64(&buf[4], d->tv_sec); + msgpack_pack_ext(x, 12, -1); + msgpack_pack_append_buffer(x, buf, 12); + } +} + +#undef msgpack_pack_inline_func +#undef msgpack_pack_user +#undef msgpack_pack_append_buffer + +#undef TAKE8_8 +#undef TAKE8_16 +#undef TAKE8_32 +#undef TAKE8_64 + +#undef msgpack_pack_real_uint8 +#undef msgpack_pack_real_uint16 +#undef msgpack_pack_real_uint32 +#undef msgpack_pack_real_uint64 +#undef msgpack_pack_real_int8 +#undef msgpack_pack_real_int16 +#undef msgpack_pack_real_int32 +#undef msgpack_pack_real_int64 + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/c/third_party/msgpack/include/msgpack/sbuffer.h b/c/third_party/msgpack/include/msgpack/sbuffer.h new file mode 100644 index 0000000..572d8f2 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/sbuffer.h @@ -0,0 +1,115 @@ +/* + * MessagePack for C simple buffer implementation + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost 
Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_SBUFFER_H +#define MSGPACK_SBUFFER_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @defgroup msgpack_sbuffer Simple buffer + * @ingroup msgpack_buffer + * @{ + */ + +typedef struct msgpack_sbuffer { + size_t size; + char* data; + size_t alloc; +} msgpack_sbuffer; + +static inline void msgpack_sbuffer_init(msgpack_sbuffer* sbuf) +{ + memset(sbuf, 0, sizeof(msgpack_sbuffer)); +} + +static inline void msgpack_sbuffer_destroy(msgpack_sbuffer* sbuf) +{ + free(sbuf->data); +} + +static inline msgpack_sbuffer* msgpack_sbuffer_new(void) +{ + return (msgpack_sbuffer*)calloc(1, sizeof(msgpack_sbuffer)); +} + +static inline void msgpack_sbuffer_free(msgpack_sbuffer* sbuf) +{ + if(sbuf == NULL) { return; } + msgpack_sbuffer_destroy(sbuf); + free(sbuf); +} + +#ifndef MSGPACK_SBUFFER_INIT_SIZE +#define MSGPACK_SBUFFER_INIT_SIZE 8192 +#endif + +static inline int msgpack_sbuffer_write(void* data, const char* buf, size_t len) +{ + msgpack_sbuffer* sbuf = (msgpack_sbuffer*)data; + + assert(buf || len == 0); + if(!buf) return 0; + + if(sbuf->alloc - sbuf->size < len) { + void* tmp; + size_t nsize = (sbuf->alloc) ? 
+ sbuf->alloc * 2 : MSGPACK_SBUFFER_INIT_SIZE; + + while(nsize < sbuf->size + len) { + size_t tmp_nsize = nsize * 2; + if (tmp_nsize <= nsize) { + nsize = sbuf->size + len; + break; + } + nsize = tmp_nsize; + } + + tmp = realloc(sbuf->data, nsize); + if(!tmp) { return -1; } + + sbuf->data = (char*)tmp; + sbuf->alloc = nsize; + } + + memcpy(sbuf->data + sbuf->size, buf, len); + sbuf->size += len; + + return 0; +} + +static inline char* msgpack_sbuffer_release(msgpack_sbuffer* sbuf) +{ + char* tmp = sbuf->data; + sbuf->size = 0; + sbuf->data = NULL; + sbuf->alloc = 0; + return tmp; +} + +static inline void msgpack_sbuffer_clear(msgpack_sbuffer* sbuf) +{ + sbuf->size = 0; +} + +/** @} */ + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/sbuffer.h */ diff --git a/c/third_party/msgpack/include/msgpack/sysdep.h b/c/third_party/msgpack/include/msgpack/sysdep.h new file mode 100644 index 0000000..68392e6 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/sysdep.h @@ -0,0 +1,215 @@ +/* + * MessagePack system dependencies + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_SYSDEP_H +#define MSGPACK_SYSDEP_H + +#include +#include +#include + +#ifndef MSGPACK_ENDIAN_BIG_BYTE +#define MSGPACK_ENDIAN_BIG_BYTE 0 +#endif +#ifndef MSGPACK_ENDIAN_LITTLE_BYTE +#define MSGPACK_ENDIAN_LITTLE_BYTE 1 +#endif + +#if defined(_MSC_VER) && _MSC_VER <= 1800 +# define snprintf(buf, len, format,...) 
_snprintf_s(buf, len, _TRUNCATE, format, __VA_ARGS__) +#endif + +#if defined(_MSC_VER) && _MSC_VER < 1600 + typedef signed __int8 int8_t; + typedef unsigned __int8 uint8_t; + typedef signed __int16 int16_t; + typedef unsigned __int16 uint16_t; + typedef signed __int32 int32_t; + typedef unsigned __int32 uint32_t; + typedef signed __int64 int64_t; + typedef unsigned __int64 uint64_t; +# if defined(_WIN64) + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +# else + typedef signed __int32 intptr_t; + typedef unsigned __int32 uintptr_t; +# endif +#elif defined(_MSC_VER) /* _MSC_VER >= 1600 */ +# include +#else +# include +# include +#endif + +#if !defined(MSGPACK_DLLEXPORT) +#if defined(_MSC_VER) +# define MSGPACK_DLLEXPORT __declspec(dllexport) +#else /* _MSC_VER */ +# define MSGPACK_DLLEXPORT +#endif /* _MSC_VER */ +#endif + +#ifdef _WIN32 +# if defined(_KERNEL_MODE) +# define _msgpack_atomic_counter_header +# else +# define _msgpack_atomic_counter_header +# if !defined(WIN32_LEAN_AND_MEAN) +# define WIN32_LEAN_AND_MEAN +# endif /* WIN32_LEAN_AND_MEAN */ +# endif + typedef long _msgpack_atomic_counter_t; +#if defined(_AMD64_) || defined(_M_X64) || defined(_M_ARM64) +# define _msgpack_sync_decr_and_fetch(ptr) _InterlockedDecrement(ptr) +# define _msgpack_sync_incr_and_fetch(ptr) _InterlockedIncrement(ptr) +#else +# define _msgpack_sync_decr_and_fetch(ptr) InterlockedDecrement(ptr) +# define _msgpack_sync_incr_and_fetch(ptr) InterlockedIncrement(ptr) +#endif +#elif defined(__GNUC__) && ((__GNUC__*10 + __GNUC_MINOR__) < 41) + +# if defined(__cplusplus) +# define _msgpack_atomic_counter_header "msgpack/gcc_atomic.hpp" +# else +# define _msgpack_atomic_counter_header "msgpack/gcc_atomic.h" +# endif + +#else + typedef unsigned int _msgpack_atomic_counter_t; +# define _msgpack_sync_decr_and_fetch(ptr) __sync_sub_and_fetch(ptr, 1) +# define _msgpack_sync_incr_and_fetch(ptr) __sync_add_and_fetch(ptr, 1) +#endif + +#ifdef _WIN32 + +# ifdef __cplusplus + 
/* numeric_limits::min,max */ +# ifdef max +# undef max +# endif +# ifdef min +# undef min +# endif +# endif + +#elif defined(unix) || defined(__unix) || defined(__APPLE__) || defined(__OpenBSD__) + +#include /* __BYTE_ORDER */ +# if defined(linux) +# include +# endif + +#endif + +#if MSGPACK_ENDIAN_LITTLE_BYTE + +# if defined(unix) || defined(__unix) || defined(__APPLE__) || defined(__OpenBSD__) +# define _msgpack_be16(x) ntohs((uint16_t)x) +# else +# if defined(ntohs) +# define _msgpack_be16(x) ntohs(x) +# elif defined(_byteswap_ushort) || (defined(_MSC_VER) && _MSC_VER >= 1400) +# define _msgpack_be16(x) ((uint16_t)_byteswap_ushort((unsigned short)x)) +# else +# define _msgpack_be16(x) ( \ + ((((uint16_t)x) << 8) ) | \ + ((((uint16_t)x) >> 8) ) ) +# endif +# endif + +# if defined(unix) || defined(__unix) || defined(__APPLE__) || defined(__OpenBSD__) +# define _msgpack_be32(x) ntohl((uint32_t)x) +# else +# if defined(ntohl) +# define _msgpack_be32(x) ntohl(x) +# elif defined(_byteswap_ulong) || (defined(_MSC_VER) && _MSC_VER >= 1400) +# define _msgpack_be32(x) ((uint32_t)_byteswap_ulong((unsigned long)x)) +# else +# define _msgpack_be32(x) \ + ( ((((uint32_t)x) << 24) ) | \ + ((((uint32_t)x) << 8) & 0x00ff0000U ) | \ + ((((uint32_t)x) >> 8) & 0x0000ff00U ) | \ + ((((uint32_t)x) >> 24) ) ) +# endif +# endif + +# if defined(_byteswap_uint64) || (defined(_MSC_VER) && _MSC_VER >= 1400) +# define _msgpack_be64(x) (_byteswap_uint64(x)) +# elif defined(bswap_64) +# define _msgpack_be64(x) bswap_64(x) +# elif defined(__DARWIN_OSSwapInt64) +# define _msgpack_be64(x) __DARWIN_OSSwapInt64(x) +# else +# define _msgpack_be64(x) \ + ( ((((uint64_t)x) << 56) ) | \ + ((((uint64_t)x) << 40) & 0x00ff000000000000ULL ) | \ + ((((uint64_t)x) << 24) & 0x0000ff0000000000ULL ) | \ + ((((uint64_t)x) << 8) & 0x000000ff00000000ULL ) | \ + ((((uint64_t)x) >> 8) & 0x00000000ff000000ULL ) | \ + ((((uint64_t)x) >> 24) & 0x0000000000ff0000ULL ) | \ + ((((uint64_t)x) >> 40) & 
0x000000000000ff00ULL ) | \ + ((((uint64_t)x) >> 56) ) ) +# endif + +#elif MSGPACK_ENDIAN_BIG_BYTE + +# define _msgpack_be16(x) (x) +# define _msgpack_be32(x) (x) +# define _msgpack_be64(x) (x) + +#else +# error msgpack-c supports only big endian and little endian +#endif /* MSGPACK_ENDIAN_LITTLE_BYTE */ +/* _msgpack_loadNN: memcpy sizeof(cast) bytes from (from) to (to), then convert the stored value in place with _msgpack_beNN. NOTE(review): the three load macros end in "while (0);" (trailing semicolon), unlike the store macros below, so an unbraced "if (c) _msgpack_load16(...); else ..." will not compile - always brace such call sites. */ +#define _msgpack_load16(cast, from, to) do { \ + memcpy((cast*)(to), (from), sizeof(cast)); \ + *(to) = (cast)_msgpack_be16(*(to)); \ + } while (0); + +#define _msgpack_load32(cast, from, to) do { \ + memcpy((cast*)(to), (from), sizeof(cast)); \ + *(to) = (cast)_msgpack_be32(*(to)); \ + } while (0); +#define _msgpack_load64(cast, from, to) do { \ + memcpy((cast*)(to), (from), sizeof(cast)); \ + *(to) = (cast)_msgpack_be64(*(to)); \ + } while (0); +/* _msgpack_storeNN: convert num with _msgpack_beNN into a local, then memcpy its 2/4/8 bytes to (to). */ +#define _msgpack_store16(to, num) \ + do { uint16_t val = _msgpack_be16(num); memcpy(to, &val, 2); } while(0) +#define _msgpack_store32(to, num) \ + do { uint32_t val = _msgpack_be32(num); memcpy(to, &val, 4); } while(0) +#define _msgpack_store64(to, num) \ + do { uint64_t val = _msgpack_be64(num); memcpy(to, &val, 8); } while(0) + +#if !defined(__cplusplus) && defined(_MSC_VER) +# if !defined(_KERNEL_MODE) +# if !defined(FALSE) +# define FALSE (0) +# endif +# if !defined(TRUE) +# define TRUE (!FALSE) +# endif +# endif +# if _MSC_VER >= 1800 +# include +# else +# define bool int +# define true TRUE +# define false FALSE +# endif +# define inline __inline +#endif + +#ifdef __APPLE__ +# include +#endif + +#endif /* msgpack/sysdep.h */ diff --git a/c/third_party/msgpack/include/msgpack/timestamp.h b/c/third_party/msgpack/include/msgpack/timestamp.h new file mode 100644 index 0000000..7613931 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/timestamp.h @@ -0,0 +1,58 @@ +/* + * MessagePack for C TimeStamp + * + * Copyright (C) 2018 KONDO Takatoshi + * + * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_TIMESTAMP_H +#define MSGPACK_TIMESTAMP_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Decoded timestamp: signed 64-bit seconds plus unsigned 32-bit nanoseconds. */ +typedef struct msgpack_timestamp { + int64_t tv_sec; + uint32_t tv_nsec; +} msgpack_timestamp; +/* Decodes an ext object of type -1 into ts. Payload size 4: 32-bit unsigned seconds, nanoseconds set to 0. Size 8: one 64-bit word, upper 30 bits nanoseconds, lower 34 bits seconds. Size 12: 32-bit nanoseconds followed by 64-bit signed seconds. Returns false without writing ts for any other object type, ext type or size. */ +static inline bool msgpack_object_to_timestamp(const msgpack_object* obj, msgpack_timestamp* ts) { + if (obj->type != MSGPACK_OBJECT_EXT) return false; + if (obj->via.ext.type != -1) return false; + switch (obj->via.ext.size) { + case 4: + ts->tv_nsec = 0; + { + uint32_t v; + _msgpack_load32(uint32_t, obj->via.ext.ptr, &v); + ts->tv_sec = v; + } + return true; + case 8: { + uint64_t value; + _msgpack_load64(uint64_t, obj->via.ext.ptr, &value); + ts->tv_nsec = (uint32_t)(value >> 34); + ts->tv_sec = value & 0x00000003ffffffffLL; + return true; + } + case 12: + _msgpack_load32(uint32_t, obj->via.ext.ptr, &ts->tv_nsec); + _msgpack_load64(int64_t, obj->via.ext.ptr + 4, &ts->tv_sec); + return true; + default: + return false; + } +} + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/timestamp.h */ diff --git a/c/third_party/msgpack/include/msgpack/unpack.h b/c/third_party/msgpack/include/msgpack/unpack.h new file mode 100644 index 0000000..036d575 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/unpack.h @@ -0,0 +1,281 @@ +/* + * MessagePack for C unpacking routine + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_UNPACKER_H +#define MSGPACK_UNPACKER_H + +#include "zone.h" +#include "object.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @defgroup msgpack_unpack Deserializer + * @ingroup msgpack + * @{ + */ + +typedef struct msgpack_unpacked { + msgpack_zone* zone; + msgpack_object data; +} msgpack_unpacked; + +typedef enum { + MSGPACK_UNPACK_SUCCESS = 2, + MSGPACK_UNPACK_EXTRA_BYTES = 1, + MSGPACK_UNPACK_CONTINUE = 0, + MSGPACK_UNPACK_PARSE_ERROR = -1, + MSGPACK_UNPACK_NOMEM_ERROR = -2 +} msgpack_unpack_return; + + +MSGPACK_DLLEXPORT +msgpack_unpack_return +msgpack_unpack_next(msgpack_unpacked* result, + const char* data, size_t len, size_t* off); + +/** @} */ + + +/** + * @defgroup msgpack_unpacker Streaming deserializer + * @ingroup msgpack + * @{ + */ + +typedef struct msgpack_unpacker { + char* buffer; + size_t used; + size_t free; + size_t off; + size_t parsed; + msgpack_zone* z; + size_t initial_buffer_size; + void* ctx; +} msgpack_unpacker; + + +#ifndef MSGPACK_UNPACKER_INIT_BUFFER_SIZE +#define MSGPACK_UNPACKER_INIT_BUFFER_SIZE (64*1024) +#endif + +/** + * Initializes a streaming deserializer. + * The initialized deserializer must be destroyed by msgpack_unpacker_destroy(msgpack_unpacker*). + */ +MSGPACK_DLLEXPORT +bool msgpack_unpacker_init(msgpack_unpacker* mpac, size_t initial_buffer_size); + +/** + * Destroys a streaming deserializer initialized by msgpack_unpacker_init(msgpack_unpacker*, size_t). + */ +MSGPACK_DLLEXPORT +void msgpack_unpacker_destroy(msgpack_unpacker* mpac); + + +/** + * Creates a streaming deserializer. + * The created deserializer must be destroyed by msgpack_unpacker_free(msgpack_unpacker*). + */ +MSGPACK_DLLEXPORT +msgpack_unpacker* msgpack_unpacker_new(size_t initial_buffer_size); + +/** + * Frees a streaming deserializer created by msgpack_unpacker_new(size_t). 
+ */ +MSGPACK_DLLEXPORT +void msgpack_unpacker_free(msgpack_unpacker* mpac); + + +#ifndef MSGPACK_UNPACKER_RESERVE_SIZE +#define MSGPACK_UNPACKER_RESERVE_SIZE (32*1024) +#endif + +/** + * Ensures the internal buffer has at least size bytes of free space, expanding it if needed. + * Use this function to fill the internal buffer with + * msgpack_unpacker_buffer(msgpack_unpacker*), + * msgpack_unpacker_buffer_capacity(const msgpack_unpacker*) and + * msgpack_unpacker_buffer_consumed(msgpack_unpacker*, size_t). + */ +static inline bool msgpack_unpacker_reserve_buffer(msgpack_unpacker* mpac, size_t size); + +/** + * Gets pointer to the free space of the internal buffer. + * Use this function to fill the internal buffer with + * msgpack_unpacker_reserve_buffer(msgpack_unpacker*, size_t), + * msgpack_unpacker_buffer_capacity(const msgpack_unpacker*) and + * msgpack_unpacker_buffer_consumed(msgpack_unpacker*, size_t). + */ +static inline char* msgpack_unpacker_buffer(msgpack_unpacker* mpac); + +/** + * Gets size of the free space of the internal buffer. + * Use this function to fill the internal buffer with + * msgpack_unpacker_reserve_buffer(msgpack_unpacker*, size_t), + * msgpack_unpacker_buffer(msgpack_unpacker*) and + * msgpack_unpacker_buffer_consumed(msgpack_unpacker*, size_t). + */ +static inline size_t msgpack_unpacker_buffer_capacity(const msgpack_unpacker* mpac); + +/** + * Notifies the deserializer that size bytes of the internal buffer have been filled. + * Use this function to fill the internal buffer with + * msgpack_unpacker_reserve_buffer(msgpack_unpacker*, size_t), + * msgpack_unpacker_buffer(msgpack_unpacker*) and + * msgpack_unpacker_buffer_capacity(const msgpack_unpacker*). + */ +static inline void msgpack_unpacker_buffer_consumed(msgpack_unpacker* mpac, size_t size); + + +/** + * Deserializes one object. + * Returns true if it successes. Otherwise false is returned. + * @param pac pointer to an initialized msgpack_unpacked object.
+ */ +MSGPACK_DLLEXPORT +msgpack_unpack_return msgpack_unpacker_next(msgpack_unpacker* mpac, msgpack_unpacked* pac); + +/** + * Deserializes one object and sets the number of parsed bytes involved. + * Returns a msgpack_unpack_return status code (an enum value, not a boolean). + * @param mpac pointer to an initialized msgpack_unpacker object. + * @param result pointer to an initialized msgpack_unpacked object. + * @param p_bytes pointer to variable that will be set with the number of parsed bytes. + */ +MSGPACK_DLLEXPORT +msgpack_unpack_return msgpack_unpacker_next_with_size(msgpack_unpacker* mpac, + msgpack_unpacked* result, + size_t *p_bytes); + +/** + * Initializes a msgpack_unpacked object. + * The initialized object must be destroyed by msgpack_unpacked_destroy(msgpack_unpacked*). + * Use the object with msgpack_unpacker_next(msgpack_unpacker*, msgpack_unpacked*) or + * msgpack_unpack_next(msgpack_unpacked*, const char*, size_t, size_t*). + */ +static inline void msgpack_unpacked_init(msgpack_unpacked* result); + +/** + * Destroys a msgpack_unpacked object initialized by msgpack_unpacked_init(msgpack_unpacked*). + */ +static inline void msgpack_unpacked_destroy(msgpack_unpacked* result); + +/** + * Releases the memory zone from msgpack_unpacked object. + * The released zone must be freed by msgpack_zone_free(msgpack_zone*).
+ */ +static inline msgpack_zone* msgpack_unpacked_release_zone(msgpack_unpacked* result); + + +MSGPACK_DLLEXPORT +int msgpack_unpacker_execute(msgpack_unpacker* mpac); + +MSGPACK_DLLEXPORT +msgpack_object msgpack_unpacker_data(msgpack_unpacker* mpac); + +MSGPACK_DLLEXPORT +msgpack_zone* msgpack_unpacker_release_zone(msgpack_unpacker* mpac); + +MSGPACK_DLLEXPORT +void msgpack_unpacker_reset_zone(msgpack_unpacker* mpac); + +MSGPACK_DLLEXPORT +void msgpack_unpacker_reset(msgpack_unpacker* mpac); + +static inline size_t msgpack_unpacker_message_size(const msgpack_unpacker* mpac); + + +/** @} */ + + +// obsolete +MSGPACK_DLLEXPORT +msgpack_unpack_return +msgpack_unpack(const char* data, size_t len, size_t* off, + msgpack_zone* result_zone, msgpack_object* result); + + + + +static inline size_t msgpack_unpacker_parsed_size(const msgpack_unpacker* mpac); + +MSGPACK_DLLEXPORT +bool msgpack_unpacker_flush_zone(msgpack_unpacker* mpac); + +MSGPACK_DLLEXPORT +bool msgpack_unpacker_expand_buffer(msgpack_unpacker* mpac, size_t size); +/* Returns true at once when mpac->free >= size; otherwise returns whatever msgpack_unpacker_expand_buffer() returns. */ +static inline bool msgpack_unpacker_reserve_buffer(msgpack_unpacker* mpac, size_t size) +{ + if(mpac->free >= size) { return true; } + return msgpack_unpacker_expand_buffer(mpac, size); +} +/* First byte past the used region: mpac->buffer + mpac->used. */ +static inline char* msgpack_unpacker_buffer(msgpack_unpacker* mpac) +{ + return mpac->buffer + mpac->used; +} +/* Number of free bytes currently recorded in mpac->free. */ +static inline size_t msgpack_unpacker_buffer_capacity(const msgpack_unpacker* mpac) +{ + return mpac->free; +} +/* Adds size to mpac->used and subtracts it from mpac->free. NOTE(review): size is not checked against mpac->free, so an oversized value wraps the size_t counter - callers must stay within msgpack_unpacker_buffer_capacity(). */ +static inline void msgpack_unpacker_buffer_consumed(msgpack_unpacker* mpac, size_t size) +{ + mpac->used += size; + mpac->free -= size; +} +/* Computes mpac->parsed - mpac->off + mpac->used. */ +static inline size_t msgpack_unpacker_message_size(const msgpack_unpacker* mpac) +{ + return mpac->parsed - mpac->off + mpac->used; +} +/* Returns mpac->parsed unchanged. */ +static inline size_t msgpack_unpacker_parsed_size(const msgpack_unpacker* mpac) +{ + return mpac->parsed; +} + +/* Zero-fills the whole msgpack_unpacked struct with memset. */ +static inline void msgpack_unpacked_init(msgpack_unpacked* result) +{ + memset(result, 0, sizeof(msgpack_unpacked)); +} + +static inline void
msgpack_unpacked_destroy(msgpack_unpacked* result) +{ + if(result->zone != NULL) { + msgpack_zone_free(result->zone); + result->zone = NULL; + memset(&result->data, 0, sizeof(msgpack_object)); + } +} + +static inline msgpack_zone* msgpack_unpacked_release_zone(msgpack_unpacked* result) +{ + if(result->zone != NULL) { + msgpack_zone* z = result->zone; + result->zone = NULL; + return z; + } + return NULL; +} + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/unpack.h */ diff --git a/c/third_party/msgpack/include/msgpack/unpack_define.h b/c/third_party/msgpack/include/msgpack/unpack_define.h new file mode 100644 index 0000000..c7decf6 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/unpack_define.h @@ -0,0 +1,89 @@ +/* + * MessagePack unpacking routine template + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_UNPACK_DEFINE_H +#define MSGPACK_UNPACK_DEFINE_H + +#include "msgpack/sysdep.h" +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifndef MSGPACK_EMBED_STACK_SIZE +#define MSGPACK_EMBED_STACK_SIZE 32 +#endif + + +typedef enum { + MSGPACK_CS_HEADER = 0x00, // nil + + //MSGPACK_CS_ = 0x01, + //MSGPACK_CS_ = 0x02, // false + //MSGPACK_CS_ = 0x03, // true + + MSGPACK_CS_BIN_8 = 0x04, + MSGPACK_CS_BIN_16 = 0x05, + MSGPACK_CS_BIN_32 = 0x06, + + MSGPACK_CS_EXT_8 = 0x07, + MSGPACK_CS_EXT_16 = 0x08, + MSGPACK_CS_EXT_32 = 0x09, + + MSGPACK_CS_FLOAT = 0x0a, + MSGPACK_CS_DOUBLE = 0x0b, + MSGPACK_CS_UINT_8 = 0x0c, + MSGPACK_CS_UINT_16 = 0x0d, + MSGPACK_CS_UINT_32 = 0x0e, + MSGPACK_CS_UINT_64 = 0x0f, + MSGPACK_CS_INT_8 = 0x10, + MSGPACK_CS_INT_16 = 0x11, + MSGPACK_CS_INT_32 = 0x12, + MSGPACK_CS_INT_64 = 0x13, + + MSGPACK_CS_FIXEXT_1 = 0x14, + MSGPACK_CS_FIXEXT_2 = 0x15, + MSGPACK_CS_FIXEXT_4 = 0x16, + MSGPACK_CS_FIXEXT_8 = 0x17, + 
MSGPACK_CS_FIXEXT_16 = 0x18, + + MSGPACK_CS_STR_8 = 0x19, // str8 + MSGPACK_CS_STR_16 = 0x1a, // str16 + MSGPACK_CS_STR_32 = 0x1b, // str32 + MSGPACK_CS_ARRAY_16 = 0x1c, + MSGPACK_CS_ARRAY_32 = 0x1d, + MSGPACK_CS_MAP_16 = 0x1e, + MSGPACK_CS_MAP_32 = 0x1f, + + //MSGPACK_ACS_BIG_INT_VALUE, + //MSGPACK_ACS_BIG_FLOAT_VALUE, + MSGPACK_ACS_STR_VALUE, + MSGPACK_ACS_BIN_VALUE, + MSGPACK_ACS_EXT_VALUE +} msgpack_unpack_state; + + +typedef enum { + MSGPACK_CT_ARRAY_ITEM, + MSGPACK_CT_MAP_KEY, + MSGPACK_CT_MAP_VALUE +} msgpack_container_type; + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/unpack_define.h */ + diff --git a/c/third_party/msgpack/include/msgpack/unpack_template.h b/c/third_party/msgpack/include/msgpack/unpack_template.h new file mode 100644 index 0000000..de30f3c --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/unpack_template.h @@ -0,0 +1,471 @@ +/* + * MessagePack unpacking routine template + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ + +#ifndef msgpack_unpack_func +#error msgpack_unpack_func template is not defined +#endif + +#ifndef msgpack_unpack_callback +#error msgpack_unpack_callback template is not defined +#endif + +#ifndef msgpack_unpack_struct +#error msgpack_unpack_struct template is not defined +#endif + +#ifndef msgpack_unpack_struct_decl +#define msgpack_unpack_struct_decl(name) msgpack_unpack_struct(name) +#endif + +#ifndef msgpack_unpack_object +#error msgpack_unpack_object type is not defined +#endif + +#ifndef msgpack_unpack_user +#error msgpack_unpack_user type is not defined +#endif + +#ifndef USE_CASE_RANGE +#if !defined(_MSC_VER) +#define USE_CASE_RANGE +#endif +#endif + +#if defined(_KERNEL_MODE) +#undef assert +#define assert NT_ASSERT +#endif + +msgpack_unpack_struct_decl(_stack) { + msgpack_unpack_object obj; + size_t count; + unsigned int ct; + msgpack_unpack_object map_key; +}; + +msgpack_unpack_struct_decl(_context) { + msgpack_unpack_user user; + unsigned int cs; + unsigned int trail; + unsigned int top; + /* + msgpack_unpack_struct(_stack)* stack; + unsigned int stack_size; + msgpack_unpack_struct(_stack) embed_stack[MSGPACK_EMBED_STACK_SIZE]; + */ + msgpack_unpack_struct(_stack) stack[MSGPACK_EMBED_STACK_SIZE]; +}; + + +msgpack_unpack_func(void, _init)(msgpack_unpack_struct(_context)* ctx) +{ + ctx->cs = MSGPACK_CS_HEADER; + ctx->trail = 0; + ctx->top = 0; + /* + ctx->stack = ctx->embed_stack; + ctx->stack_size = MSGPACK_EMBED_STACK_SIZE; + */ + ctx->stack[0].obj = msgpack_unpack_callback(_root)(&ctx->user); +} + +/* +msgpack_unpack_func(void, _destroy)(msgpack_unpack_struct(_context)* ctx) +{ + if(ctx->stack_size != MSGPACK_EMBED_STACK_SIZE) { + free(ctx->stack); + } +} +*/ + +msgpack_unpack_func(msgpack_unpack_object, _data)(msgpack_unpack_struct(_context)* ctx) +{ + return (ctx)->stack[0].obj; +} + + +msgpack_unpack_func(int, 
_execute)(msgpack_unpack_struct(_context)* ctx, const char* data, size_t len, size_t* off) +{ + assert(len >= *off); + { + const unsigned char* p = (unsigned char*)data + *off; + const unsigned char* const pe = (unsigned char*)data + len; + const void* n = NULL; + + unsigned int trail = ctx->trail; + unsigned int cs = ctx->cs; + unsigned int top = ctx->top; + msgpack_unpack_struct(_stack)* stack = ctx->stack; + /* + unsigned int stack_size = ctx->stack_size; + */ + msgpack_unpack_user* user = &ctx->user; + + msgpack_unpack_object obj; + msgpack_unpack_struct(_stack)* c = NULL; + + int ret; + +#define push_simple_value(func) \ + ret = msgpack_unpack_callback(func)(user, &obj); \ + if(ret < 0) { goto _failed; } \ + goto _push +#define push_fixed_value(func, arg) \ + ret = msgpack_unpack_callback(func)(user, arg, &obj); \ + if(ret < 0) { goto _failed; } \ + goto _push +#define push_variable_value(func, base, pos, len) \ + ret = msgpack_unpack_callback(func)(user, \ + (const char*)base, (const char*)pos, len, &obj); \ + if(ret < 0) { goto _failed; } \ + goto _push + +#define again_fixed_trail(_cs, trail_len) \ + trail = trail_len; \ + cs = _cs; \ + goto _fixed_trail_again +#define again_fixed_trail_if_zero(_cs, trail_len, ifzero) \ + trail = trail_len; \ + if(trail == 0) { goto ifzero; } \ + cs = _cs; \ + goto _fixed_trail_again + +#define start_container(func, count_, ct_) \ + if(top >= MSGPACK_EMBED_STACK_SIZE) { \ + ret = MSGPACK_UNPACK_NOMEM_ERROR; \ + goto _failed; \ + } /* FIXME */ \ + ret = msgpack_unpack_callback(func)(user, count_, &stack[top].obj); \ + if(ret < 0) { goto _failed; } \ + if((count_) == 0) { obj = stack[top].obj; goto _push; } \ + stack[top].ct = ct_; \ + stack[top].count = count_; \ + ++top; \ + goto _header_again + +#define NEXT_CS(p) \ + ((unsigned int)*p & 0x1f) + +#ifdef USE_CASE_RANGE +#define SWITCH_RANGE_BEGIN switch(*p) { +#define SWITCH_RANGE(FROM, TO) case FROM ... 
TO: +#define SWITCH_RANGE_DEFAULT default: +#define SWITCH_RANGE_END } +#else +#define SWITCH_RANGE_BEGIN { if(0) { +#define SWITCH_RANGE(FROM, TO) } else if(FROM <= *p && *p <= TO) { +#define SWITCH_RANGE_DEFAULT } else { +#define SWITCH_RANGE_END } } +#endif + + if(p == pe) { goto _out; } + do { + switch(cs) { + case MSGPACK_CS_HEADER: + SWITCH_RANGE_BEGIN + SWITCH_RANGE(0x00, 0x7f) // Positive Fixnum + push_fixed_value(_uint8, *(uint8_t*)p); + SWITCH_RANGE(0xe0, 0xff) // Negative Fixnum + push_fixed_value(_int8, *(int8_t*)p); + SWITCH_RANGE(0xc0, 0xdf) // Variable + switch(*p) { + case 0xc0: // nil + push_simple_value(_nil); + //case 0xc1: // string + // again_terminal_trail(NEXT_CS(p), p+1); + case 0xc2: // false + push_simple_value(_false); + case 0xc3: // true + push_simple_value(_true); + case 0xc4: // bin 8 + case 0xc5: // bin 16 + case 0xc6: // bin 32 + again_fixed_trail(NEXT_CS(p), 1 << (((unsigned int)*p) & 0x03)); + case 0xc7: // ext 8 + case 0xc8: // ext 16 + case 0xc9: // ext 32 + again_fixed_trail(NEXT_CS(p), 1 << ((((unsigned int)*p) + 1) & 0x03)); + case 0xca: // float + case 0xcb: // double + case 0xcc: // unsigned int 8 + case 0xcd: // unsigned int 16 + case 0xce: // unsigned int 32 + case 0xcf: // unsigned int 64 + case 0xd0: // signed int 8 + case 0xd1: // signed int 16 + case 0xd2: // signed int 32 + case 0xd3: // signed int 64 + again_fixed_trail(NEXT_CS(p), 1 << (((unsigned int)*p) & 0x03)); + case 0xd4: // fixext 1 + case 0xd5: // fixext 2 + case 0xd6: // fixext 4 + case 0xd7: // fixext 8 + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, + (1 << (((unsigned int)*p) & 0x03)) + 1, _ext_zero); + case 0xd8: // fixext 16 + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, 16+1, _ext_zero); + + case 0xd9: // str 8 + case 0xda: // str 16 + case 0xdb: // str 32 + again_fixed_trail(NEXT_CS(p), 1 << ((((unsigned int)*p) & 0x03) - 1)); + case 0xdc: // array 16 + case 0xdd: // array 32 + case 0xde: // map 16 + case 0xdf: // map 32 + 
again_fixed_trail(NEXT_CS(p), 2u << (((unsigned int)*p) & 0x01)); + default: + ret = MSGPACK_UNPACK_PARSE_ERROR; + goto _failed; + } + SWITCH_RANGE(0xa0, 0xbf) // FixStr + again_fixed_trail_if_zero(MSGPACK_ACS_STR_VALUE, ((unsigned int)*p & 0x1f), _str_zero); + SWITCH_RANGE(0x90, 0x9f) // FixArray + start_container(_array, ((unsigned int)*p) & 0x0f, MSGPACK_CT_ARRAY_ITEM); + SWITCH_RANGE(0x80, 0x8f) // FixMap + start_container(_map, ((unsigned int)*p) & 0x0f, MSGPACK_CT_MAP_KEY); + + SWITCH_RANGE_DEFAULT + ret = MSGPACK_UNPACK_PARSE_ERROR; + goto _failed; + SWITCH_RANGE_END + // end MSGPACK_CS_HEADER + + + _fixed_trail_again: + ++p; + // fallthrough + + default: + if((size_t)(pe - p) < trail) { goto _out; } + n = p; p += trail - 1; + switch(cs) { + //case MSGPACK_CS_ + //case MSGPACK_CS_ + case MSGPACK_CS_FLOAT: { + union { uint32_t i; float f; } mem; + _msgpack_load32(uint32_t, n, &mem.i); + push_fixed_value(_float, mem.f); } + case MSGPACK_CS_DOUBLE: { + union { uint64_t i; double f; } mem; + _msgpack_load64(uint64_t, n, &mem.i); +#if defined(TARGET_OS_IPHONE) + // ok +#elif defined(__arm__) && !(__ARM_EABI__) // arm-oabi + // https://github.com/msgpack/msgpack-perl/pull/1 + mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL); +#endif + push_fixed_value(_double, mem.f); } + case MSGPACK_CS_UINT_8: + push_fixed_value(_uint8, *(uint8_t*)n); + case MSGPACK_CS_UINT_16:{ + uint16_t tmp; + _msgpack_load16(uint16_t,n,&tmp); + push_fixed_value(_uint16, tmp); + } + case MSGPACK_CS_UINT_32:{ + uint32_t tmp; + _msgpack_load32(uint32_t,n,&tmp); + push_fixed_value(_uint32, tmp); + } + case MSGPACK_CS_UINT_64:{ + uint64_t tmp; + _msgpack_load64(uint64_t,n,&tmp); + push_fixed_value(_uint64, tmp); + } + case MSGPACK_CS_INT_8: + push_fixed_value(_int8, *(int8_t*)n); + case MSGPACK_CS_INT_16:{ + int16_t tmp; + _msgpack_load16(int16_t,n,&tmp); + push_fixed_value(_int16, tmp); + } + case MSGPACK_CS_INT_32:{ + int32_t tmp; + _msgpack_load32(int32_t,n,&tmp); + 
push_fixed_value(_int32, tmp); + } + case MSGPACK_CS_INT_64:{ + int64_t tmp; + _msgpack_load64(int64_t,n,&tmp); + push_fixed_value(_int64, tmp); + } + case MSGPACK_CS_FIXEXT_1: + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, 1+1, _ext_zero); + case MSGPACK_CS_FIXEXT_2: + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, 2+1, _ext_zero); + case MSGPACK_CS_FIXEXT_4: + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, 4+1, _ext_zero); + case MSGPACK_CS_FIXEXT_8: + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, 8+1, _ext_zero); + case MSGPACK_CS_FIXEXT_16: + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, 16+1, _ext_zero); + case MSGPACK_CS_STR_8: + again_fixed_trail_if_zero(MSGPACK_ACS_STR_VALUE, *(uint8_t*)n, _str_zero); + case MSGPACK_CS_BIN_8: + again_fixed_trail_if_zero(MSGPACK_ACS_BIN_VALUE, *(uint8_t*)n, _bin_zero); + case MSGPACK_CS_EXT_8: + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, (*(uint8_t*)n) + 1, _ext_zero); + case MSGPACK_CS_STR_16:{ + uint16_t tmp; + _msgpack_load16(uint16_t,n,&tmp); + again_fixed_trail_if_zero(MSGPACK_ACS_STR_VALUE, tmp, _str_zero); + } + case MSGPACK_CS_BIN_16:{ + uint16_t tmp; + _msgpack_load16(uint16_t,n,&tmp); + again_fixed_trail_if_zero(MSGPACK_ACS_BIN_VALUE, tmp, _bin_zero); + } + case MSGPACK_CS_EXT_16:{ + uint16_t tmp; + _msgpack_load16(uint16_t,n,&tmp); + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, tmp + 1, _ext_zero); + } + case MSGPACK_CS_STR_32:{ + uint32_t tmp; + _msgpack_load32(uint32_t,n,&tmp); + again_fixed_trail_if_zero(MSGPACK_ACS_STR_VALUE, tmp, _str_zero); + } + case MSGPACK_CS_BIN_32:{ + uint32_t tmp; + _msgpack_load32(uint32_t,n,&tmp); + again_fixed_trail_if_zero(MSGPACK_ACS_BIN_VALUE, tmp, _bin_zero); + } + case MSGPACK_CS_EXT_32:{ + uint32_t tmp; + _msgpack_load32(uint32_t,n,&tmp); + again_fixed_trail_if_zero(MSGPACK_ACS_EXT_VALUE, tmp + 1, _ext_zero); + } + case MSGPACK_ACS_STR_VALUE: + _str_zero: + push_variable_value(_str, data, n, trail); + case MSGPACK_ACS_BIN_VALUE: + _bin_zero: + 
push_variable_value(_bin, data, n, trail); + case MSGPACK_ACS_EXT_VALUE: + _ext_zero: + push_variable_value(_ext, data, n, trail); + + case MSGPACK_CS_ARRAY_16:{ + uint16_t tmp; + _msgpack_load16(uint16_t,n,&tmp); + start_container(_array, tmp, MSGPACK_CT_ARRAY_ITEM); + } + case MSGPACK_CS_ARRAY_32:{ + /* FIXME security guard */ + uint32_t tmp; + _msgpack_load32(uint32_t,n,&tmp); + start_container(_array, tmp, MSGPACK_CT_ARRAY_ITEM); + } + + case MSGPACK_CS_MAP_16:{ + uint16_t tmp; + _msgpack_load16(uint16_t,n,&tmp); + start_container(_map, tmp, MSGPACK_CT_MAP_KEY); + } + case MSGPACK_CS_MAP_32:{ + /* FIXME security guard */ + uint32_t tmp; + _msgpack_load32(uint32_t,n,&tmp); + start_container(_map, tmp, MSGPACK_CT_MAP_KEY); + } + + default: + ret = MSGPACK_UNPACK_PARSE_ERROR; + goto _failed; + } + } + + _push: + if(top == 0) { goto _finish; } + c = &stack[top-1]; + switch(c->ct) { + case MSGPACK_CT_ARRAY_ITEM: + ret = msgpack_unpack_callback(_array_item)(user, &c->obj, obj); \ + if(ret < 0) { goto _failed; } + if(--c->count == 0) { + obj = c->obj; + --top; + /*printf("stack pop %d\n", top);*/ + goto _push; + } + goto _header_again; + case MSGPACK_CT_MAP_KEY: + c->map_key = obj; + c->ct = MSGPACK_CT_MAP_VALUE; + goto _header_again; + case MSGPACK_CT_MAP_VALUE: + ret = msgpack_unpack_callback(_map_item)(user, &c->obj, c->map_key, obj); \ + if(ret < 0) { goto _failed; } + if(--c->count == 0) { + obj = c->obj; + --top; + /*printf("stack pop %d\n", top);*/ + goto _push; + } + c->ct = MSGPACK_CT_MAP_KEY; + goto _header_again; + + default: + ret = MSGPACK_UNPACK_PARSE_ERROR; + goto _failed; + } + + _header_again: + cs = MSGPACK_CS_HEADER; + ++p; + } while(p != pe); + goto _out; + + + _finish: + stack[0].obj = obj; + ++p; + ret = 1; + /*printf("-- finish --\n"); */ + goto _end; + + _failed: + /*printf("** FAILED **\n"); */ + goto _end; + + _out: + ret = 0; + goto _end; + + _end: + ctx->cs = cs; + ctx->trail = trail; + ctx->top = top; + *off = (size_t)(p - (const unsigned 
char*)data); + + return ret; + } +} + +#undef msgpack_unpack_func +#undef msgpack_unpack_callback +#undef msgpack_unpack_struct +#undef msgpack_unpack_object +#undef msgpack_unpack_user + +#undef push_simple_value +#undef push_fixed_value +#undef push_variable_value +#undef again_fixed_trail +#undef again_fixed_trail_if_zero +#undef start_container + +#undef NEXT_CS + +#undef SWITCH_RANGE_BEGIN +#undef SWITCH_RANGE +#undef SWITCH_RANGE_DEFAULT +#undef SWITCH_RANGE_END diff --git a/c/third_party/msgpack/include/msgpack/util.h b/c/third_party/msgpack/include/msgpack/util.h new file mode 100644 index 0000000..959b56b --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/util.h @@ -0,0 +1,15 @@ +/* + * MessagePack for C utilities + * + * Copyright (C) 2014 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_UTIL_H +#define MSGPACK_UTIL_H + +#define MSGPACK_UNUSED(a) (void)(a) + +#endif /* MSGPACK_UTIL_H */ diff --git a/c/third_party/msgpack/include/msgpack/version.h b/c/third_party/msgpack/include/msgpack/version.h new file mode 100644 index 0000000..bd6605b --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/version.h @@ -0,0 +1,38 @@ +/* + * MessagePack for C version information + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_VERSION_H +#define MSGPACK_VERSION_H + +#ifdef __cplusplus +extern "C" { +#endif + +MSGPACK_DLLEXPORT +const char* msgpack_version(void); +MSGPACK_DLLEXPORT +int msgpack_version_major(void); +MSGPACK_DLLEXPORT +int msgpack_version_minor(void); +MSGPACK_DLLEXPORT +int msgpack_version_revision(void); + +#include "version_master.h" + +#define MSGPACK_STR(v) #v +#define MSGPACK_VERSION_I(maj, min, rev) MSGPACK_STR(maj) "." MSGPACK_STR(min) "." MSGPACK_STR(rev) + +#define MSGPACK_VERSION MSGPACK_VERSION_I(MSGPACK_VERSION_MAJOR, MSGPACK_VERSION_MINOR, MSGPACK_VERSION_REVISION) + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/version.h */ + diff --git a/c/third_party/msgpack/include/msgpack/version_master.h b/c/third_party/msgpack/include/msgpack/version_master.h new file mode 100644 index 0000000..ad76f73 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/version_master.h @@ -0,0 +1,3 @@ +#define MSGPACK_VERSION_MAJOR 6 +#define MSGPACK_VERSION_MINOR 1 +#define MSGPACK_VERSION_REVISION 0 diff --git a/c/third_party/msgpack/include/msgpack/vrefbuffer.h b/c/third_party/msgpack/include/msgpack/vrefbuffer.h new file mode 100644 index 0000000..c263305 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/vrefbuffer.h @@ -0,0 +1,146 @@ +/* + * MessagePack for C zero-copy buffer implementation + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_VREFBUFFER_H +#define MSGPACK_VREFBUFFER_H + +#include "zone.h" +#include +#include + +#if defined(unix) || defined(__unix) || defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__QNX__) || defined(__QNXTO__) || defined(__HAIKU__) +#include +typedef struct iovec msgpack_iovec; +#else +struct msgpack_iovec { + void *iov_base; + size_t iov_len; +}; +typedef struct msgpack_iovec msgpack_iovec; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @defgroup msgpack_vrefbuffer Vectored Referencing buffer + * @ingroup msgpack_buffer + * @{ + */ + +struct msgpack_vrefbuffer_chunk; +typedef struct msgpack_vrefbuffer_chunk msgpack_vrefbuffer_chunk; + +typedef struct msgpack_vrefbuffer_inner_buffer { + size_t free; + char* ptr; + msgpack_vrefbuffer_chunk* head; +} msgpack_vrefbuffer_inner_buffer; + +typedef struct msgpack_vrefbuffer { + msgpack_iovec* tail; + msgpack_iovec* end; + msgpack_iovec* array; + + size_t chunk_size; + size_t ref_size; + + msgpack_vrefbuffer_inner_buffer inner_buffer; +} msgpack_vrefbuffer; + + +#ifndef MSGPACK_VREFBUFFER_REF_SIZE +#define MSGPACK_VREFBUFFER_REF_SIZE 32 +#endif + +#ifndef MSGPACK_VREFBUFFER_CHUNK_SIZE +#define MSGPACK_VREFBUFFER_CHUNK_SIZE 8192 +#endif + +MSGPACK_DLLEXPORT +bool msgpack_vrefbuffer_init(msgpack_vrefbuffer* vbuf, + size_t ref_size, size_t chunk_size); +MSGPACK_DLLEXPORT +void msgpack_vrefbuffer_destroy(msgpack_vrefbuffer* vbuf); + +static inline msgpack_vrefbuffer* msgpack_vrefbuffer_new(size_t ref_size, size_t chunk_size); +static inline void msgpack_vrefbuffer_free(msgpack_vrefbuffer* vbuf); + +static inline int msgpack_vrefbuffer_write(void* data, const char* buf, size_t len); + +static inline const msgpack_iovec* msgpack_vrefbuffer_vec(const msgpack_vrefbuffer* vref); +static inline size_t msgpack_vrefbuffer_veclen(const msgpack_vrefbuffer* 
vref); + +MSGPACK_DLLEXPORT +int msgpack_vrefbuffer_append_copy(msgpack_vrefbuffer* vbuf, + const char* buf, size_t len); + +MSGPACK_DLLEXPORT +int msgpack_vrefbuffer_append_ref(msgpack_vrefbuffer* vbuf, + const char* buf, size_t len); + +MSGPACK_DLLEXPORT +int msgpack_vrefbuffer_migrate(msgpack_vrefbuffer* vbuf, msgpack_vrefbuffer* to); + +MSGPACK_DLLEXPORT +void msgpack_vrefbuffer_clear(msgpack_vrefbuffer* vref); + +/** @} */ + + +static inline msgpack_vrefbuffer* msgpack_vrefbuffer_new(size_t ref_size, size_t chunk_size) +{ + msgpack_vrefbuffer* vbuf = (msgpack_vrefbuffer*)malloc(sizeof(msgpack_vrefbuffer)); + if (vbuf == NULL) return NULL; + if(!msgpack_vrefbuffer_init(vbuf, ref_size, chunk_size)) { + free(vbuf); + return NULL; + } + return vbuf; +} + +static inline void msgpack_vrefbuffer_free(msgpack_vrefbuffer* vbuf) +{ + if(vbuf == NULL) { return; } + msgpack_vrefbuffer_destroy(vbuf); + free(vbuf); +} + +static inline int msgpack_vrefbuffer_write(void* data, const char* buf, size_t len) +{ + msgpack_vrefbuffer* vbuf = (msgpack_vrefbuffer*)data; + assert(buf || len == 0); + + if(!buf) return 0; + + if(len < vbuf->ref_size) { + return msgpack_vrefbuffer_append_copy(vbuf, buf, len); + } else { + return msgpack_vrefbuffer_append_ref(vbuf, buf, len); + } +} + +static inline const msgpack_iovec* msgpack_vrefbuffer_vec(const msgpack_vrefbuffer* vref) +{ + return vref->array; +} + +static inline size_t msgpack_vrefbuffer_veclen(const msgpack_vrefbuffer* vref) +{ + return (size_t)(vref->tail - vref->array); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/vrefbuffer.h */ diff --git a/c/third_party/msgpack/include/msgpack/zbuffer.h b/c/third_party/msgpack/include/msgpack/zbuffer.h new file mode 100644 index 0000000..c38d627 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/zbuffer.h @@ -0,0 +1,205 @@ +/* + * MessagePack for C deflate buffer implementation + * + * Copyright (C) 2010 FURUHASHI Sadayuki + * + * Distributed under the Boost Software 
License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_ZBUFFER_H +#define MSGPACK_ZBUFFER_H + +#include "sysdep.h" +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @defgroup msgpack_zbuffer Compressed buffer + * @ingroup msgpack_buffer + * @{ + */ + +typedef struct msgpack_zbuffer { + z_stream stream; + char* data; + size_t init_size; +} msgpack_zbuffer; + +#ifndef MSGPACK_ZBUFFER_INIT_SIZE +#define MSGPACK_ZBUFFER_INIT_SIZE 8192 +#endif + +static inline bool msgpack_zbuffer_init( + msgpack_zbuffer* zbuf, int level, size_t init_size); +static inline void msgpack_zbuffer_destroy(msgpack_zbuffer* zbuf); + +static inline msgpack_zbuffer* msgpack_zbuffer_new(int level, size_t init_size); +static inline void msgpack_zbuffer_free(msgpack_zbuffer* zbuf); + +static inline char* msgpack_zbuffer_flush(msgpack_zbuffer* zbuf); + +static inline const char* msgpack_zbuffer_data(const msgpack_zbuffer* zbuf); +static inline size_t msgpack_zbuffer_size(const msgpack_zbuffer* zbuf); + +static inline bool msgpack_zbuffer_reset(msgpack_zbuffer* zbuf); +static inline void msgpack_zbuffer_reset_buffer(msgpack_zbuffer* zbuf); +static inline char* msgpack_zbuffer_release_buffer(msgpack_zbuffer* zbuf); + + +#ifndef MSGPACK_ZBUFFER_RESERVE_SIZE +#define MSGPACK_ZBUFFER_RESERVE_SIZE 512 +#endif + +static inline int msgpack_zbuffer_write(void* data, const char* buf, size_t len); + +static inline bool msgpack_zbuffer_expand(msgpack_zbuffer* zbuf); + + +static inline bool msgpack_zbuffer_init(msgpack_zbuffer* zbuf, + int level, size_t init_size) +{ + memset(zbuf, 0, sizeof(msgpack_zbuffer)); + zbuf->init_size = init_size; + if(deflateInit(&zbuf->stream, level) != Z_OK) { + free(zbuf->data); + return false; + } + return true; +} + +static inline void msgpack_zbuffer_destroy(msgpack_zbuffer* zbuf) +{ + deflateEnd(&zbuf->stream); + free(zbuf->data); +} + +static 
inline msgpack_zbuffer* msgpack_zbuffer_new(int level, size_t init_size) +{ + msgpack_zbuffer* zbuf = (msgpack_zbuffer*)malloc(sizeof(msgpack_zbuffer)); + if (zbuf == NULL) return NULL; + if(!msgpack_zbuffer_init(zbuf, level, init_size)) { + free(zbuf); + return NULL; + } + return zbuf; +} + +static inline void msgpack_zbuffer_free(msgpack_zbuffer* zbuf) +{ + if(zbuf == NULL) { return; } + msgpack_zbuffer_destroy(zbuf); + free(zbuf); +} + +static inline bool msgpack_zbuffer_expand(msgpack_zbuffer* zbuf) +{ + size_t used = (size_t)((char *)(zbuf->stream.next_out) - zbuf->data); + size_t csize = used + zbuf->stream.avail_out; + + size_t nsize = (csize == 0) ? zbuf->init_size : csize * 2; + + char* tmp = (char*)realloc(zbuf->data, nsize); + if(tmp == NULL) { + return false; + } + + zbuf->data = tmp; + zbuf->stream.next_out = (Bytef*)(tmp + used); + zbuf->stream.avail_out = (uInt)(nsize - used); + + return true; +} + +static inline int msgpack_zbuffer_write(void* data, const char* buf, size_t len) +{ + msgpack_zbuffer* zbuf = (msgpack_zbuffer*)data; + + assert(buf || len == 0); + if(!buf) return 0; + + zbuf->stream.next_in = (Bytef*)buf; + zbuf->stream.avail_in = (uInt)len; + + while(zbuf->stream.avail_in > 0) { + if(zbuf->stream.avail_out < MSGPACK_ZBUFFER_RESERVE_SIZE) { + if(!msgpack_zbuffer_expand(zbuf)) { + return -1; + } + } + + if(deflate(&zbuf->stream, Z_NO_FLUSH) != Z_OK) { + return -1; + } + } + + return 0; +} + +static inline char* msgpack_zbuffer_flush(msgpack_zbuffer* zbuf) +{ + while(true) { + switch(deflate(&zbuf->stream, Z_FINISH)) { + case Z_STREAM_END: + return zbuf->data; + case Z_OK: + case Z_BUF_ERROR: + if(!msgpack_zbuffer_expand(zbuf)) { + return NULL; + } + break; + default: + return NULL; + } + } +} + +static inline const char* msgpack_zbuffer_data(const msgpack_zbuffer* zbuf) +{ + return zbuf->data; +} + +static inline size_t msgpack_zbuffer_size(const msgpack_zbuffer* zbuf) +{ + return (size_t)((char *)(zbuf->stream.next_out) - zbuf->data); 
+} + +static inline void msgpack_zbuffer_reset_buffer(msgpack_zbuffer* zbuf) +{ + zbuf->stream.avail_out += (uInt)((char*)zbuf->stream.next_out - zbuf->data); + zbuf->stream.next_out = (Bytef*)zbuf->data; +} + +static inline bool msgpack_zbuffer_reset(msgpack_zbuffer* zbuf) +{ + if(deflateReset(&zbuf->stream) != Z_OK) { + return false; + } + msgpack_zbuffer_reset_buffer(zbuf); + return true; +} + +static inline char* msgpack_zbuffer_release_buffer(msgpack_zbuffer* zbuf) +{ + char* tmp = zbuf->data; + zbuf->data = NULL; + zbuf->stream.next_out = NULL; + zbuf->stream.avail_out = 0; + return tmp; +} + +/** @} */ + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/zbuffer.h */ diff --git a/c/third_party/msgpack/include/msgpack/zone.h b/c/third_party/msgpack/include/msgpack/zone.h new file mode 100644 index 0000000..7facd54 --- /dev/null +++ b/c/third_party/msgpack/include/msgpack/zone.h @@ -0,0 +1,163 @@ +/* + * MessagePack for C memory pool implementation + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef MSGPACK_ZONE_H +#define MSGPACK_ZONE_H + +#include "sysdep.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @defgroup msgpack_zone Memory zone + * @ingroup msgpack + * @{ + */ + +typedef struct msgpack_zone_finalizer { + void (*func)(void* data); + void* data; +} msgpack_zone_finalizer; + +typedef struct msgpack_zone_finalizer_array { + msgpack_zone_finalizer* tail; + msgpack_zone_finalizer* end; + msgpack_zone_finalizer* array; +} msgpack_zone_finalizer_array; + +struct msgpack_zone_chunk; +typedef struct msgpack_zone_chunk msgpack_zone_chunk; + +typedef struct msgpack_zone_chunk_list { + size_t free; + char* ptr; + msgpack_zone_chunk* head; +} msgpack_zone_chunk_list; + +typedef struct msgpack_zone { + msgpack_zone_chunk_list chunk_list; + msgpack_zone_finalizer_array finalizer_array; + size_t chunk_size; +} msgpack_zone; + +#ifndef MSGPACK_ZONE_CHUNK_SIZE +#define MSGPACK_ZONE_CHUNK_SIZE 8192 +#endif + +MSGPACK_DLLEXPORT +bool msgpack_zone_init(msgpack_zone* zone, size_t chunk_size); +MSGPACK_DLLEXPORT +void msgpack_zone_destroy(msgpack_zone* zone); + +MSGPACK_DLLEXPORT +msgpack_zone* msgpack_zone_new(size_t chunk_size); +MSGPACK_DLLEXPORT +void msgpack_zone_free(msgpack_zone* zone); + +static inline void* msgpack_zone_malloc(msgpack_zone* zone, size_t size); +static inline void* msgpack_zone_malloc_no_align(msgpack_zone* zone, size_t size); + +static inline bool msgpack_zone_push_finalizer(msgpack_zone* zone, + void (*func)(void* data), void* data); + +static inline void msgpack_zone_swap(msgpack_zone* a, msgpack_zone* b); + +MSGPACK_DLLEXPORT +bool msgpack_zone_is_empty(msgpack_zone* zone); + +MSGPACK_DLLEXPORT +void msgpack_zone_clear(msgpack_zone* zone); + +/** @} */ + + +#ifndef MSGPACK_ZONE_ALIGN +#define MSGPACK_ZONE_ALIGN sizeof(void*) +#endif + +MSGPACK_DLLEXPORT +void* msgpack_zone_malloc_expand(msgpack_zone* zone, size_t size); + 
+static inline void* msgpack_zone_malloc_no_align(msgpack_zone* zone, size_t size) +{ + char* ptr; + msgpack_zone_chunk_list* cl = &zone->chunk_list; + + if(zone->chunk_list.free < size) { + return msgpack_zone_malloc_expand(zone, size); + } + + ptr = cl->ptr; + cl->free -= size; + cl->ptr += size; + + return ptr; +} + +static inline void* msgpack_zone_malloc(msgpack_zone* zone, size_t size) +{ + char* aligned = + (char*)( + (uintptr_t)( + zone->chunk_list.ptr + (MSGPACK_ZONE_ALIGN - 1) + ) & ~(uintptr_t)(MSGPACK_ZONE_ALIGN - 1) + ); + size_t adjusted_size = size + (size_t)(aligned - zone->chunk_list.ptr); + if(zone->chunk_list.free >= adjusted_size) { + zone->chunk_list.free -= adjusted_size; + zone->chunk_list.ptr += adjusted_size; + return aligned; + } + { + void* ptr = msgpack_zone_malloc_expand(zone, size + (MSGPACK_ZONE_ALIGN - 1)); + if (ptr) { + return (char*)((uintptr_t)(ptr) & ~(uintptr_t)(MSGPACK_ZONE_ALIGN - 1)); + } + } + return NULL; +} + + +bool msgpack_zone_push_finalizer_expand(msgpack_zone* zone, + void (*func)(void* data), void* data); + +static inline bool msgpack_zone_push_finalizer(msgpack_zone* zone, + void (*func)(void* data), void* data) +{ + msgpack_zone_finalizer_array* const fa = &zone->finalizer_array; + msgpack_zone_finalizer* fin = fa->tail; + + if(fin == fa->end) { + return msgpack_zone_push_finalizer_expand(zone, func, data); + } + + fin->func = func; + fin->data = data; + + ++fa->tail; + + return true; +} + +static inline void msgpack_zone_swap(msgpack_zone* a, msgpack_zone* b) +{ + msgpack_zone tmp = *a; + *a = *b; + *b = tmp; +} + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/zone.h */ diff --git a/c/third_party/msgpack/src/objectc.c b/c/third_party/msgpack/src/objectc.c new file mode 100644 index 0000000..c33ab2b --- /dev/null +++ b/c/third_party/msgpack/src/objectc.c @@ -0,0 +1,550 @@ +/* + * MessagePack for C dynamic typing routine + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost 
Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#if defined(_KERNEL_MODE) +# undef _NO_CRT_STDIO_INLINE +# define _NO_CRT_STDIO_INLINE +#endif + +#include "msgpack/object.h" +#include "msgpack/pack.h" +#include + +#include +#include + +#if defined(_MSC_VER) +#if _MSC_VER >= 1800 +#include +#else +#define PRIu64 "I64u" +#define PRIi64 "I64i" +#define PRIi8 "i" +#endif +#else +#include +#endif + +#if defined(_KERNEL_MODE) +# undef snprintf +# define snprintf _snprintf +#endif + +int msgpack_pack_object(msgpack_packer* pk, msgpack_object d) +{ + switch(d.type) { + case MSGPACK_OBJECT_NIL: + return msgpack_pack_nil(pk); + + case MSGPACK_OBJECT_BOOLEAN: + if(d.via.boolean) { + return msgpack_pack_true(pk); + } else { + return msgpack_pack_false(pk); + } + + case MSGPACK_OBJECT_POSITIVE_INTEGER: + return msgpack_pack_uint64(pk, d.via.u64); + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: + return msgpack_pack_int64(pk, d.via.i64); + + case MSGPACK_OBJECT_FLOAT32: + return msgpack_pack_float(pk, (float)d.via.f64); + + case MSGPACK_OBJECT_FLOAT64: + return msgpack_pack_double(pk, d.via.f64); + + case MSGPACK_OBJECT_STR: + { + int ret = msgpack_pack_str(pk, d.via.str.size); + if(ret < 0) { return ret; } + return msgpack_pack_str_body(pk, d.via.str.ptr, d.via.str.size); + } + + case MSGPACK_OBJECT_BIN: + { + int ret = msgpack_pack_bin(pk, d.via.bin.size); + if(ret < 0) { return ret; } + return msgpack_pack_bin_body(pk, d.via.bin.ptr, d.via.bin.size); + } + + case MSGPACK_OBJECT_EXT: + { + int ret = msgpack_pack_ext(pk, d.via.ext.size, d.via.ext.type); + if(ret < 0) { return ret; } + return msgpack_pack_ext_body(pk, d.via.ext.ptr, d.via.ext.size); + } + + case MSGPACK_OBJECT_ARRAY: + { + int ret = msgpack_pack_array(pk, d.via.array.size); + if(ret < 0) { + return ret; + } + else { + msgpack_object* o = d.via.array.ptr; + msgpack_object* const oend = d.via.array.ptr + d.via.array.size; + for(; o != 
oend; ++o) { + ret = msgpack_pack_object(pk, *o); + if(ret < 0) { return ret; } + } + + return 0; + } + } + + case MSGPACK_OBJECT_MAP: + { + int ret = msgpack_pack_map(pk, d.via.map.size); + if(ret < 0) { + return ret; + } + else { + msgpack_object_kv* kv = d.via.map.ptr; + msgpack_object_kv* const kvend = d.via.map.ptr + d.via.map.size; + for(; kv != kvend; ++kv) { + ret = msgpack_pack_object(pk, kv->key); + if(ret < 0) { return ret; } + ret = msgpack_pack_object(pk, kv->val); + if(ret < 0) { return ret; } + } + + return 0; + } + } + + default: + return -1; + } +} + +void msgpack_object_init_nil(msgpack_object* d) { + d->type = MSGPACK_OBJECT_NIL; +} + +void msgpack_object_init_boolean(msgpack_object* d, bool v) { + d->type = MSGPACK_OBJECT_BOOLEAN; + d->via.boolean = v; +} + +void msgpack_object_init_unsigned_integer(msgpack_object* d, uint64_t v) { + d->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + d->via.u64 = v; +} + +void msgpack_object_init_signed_integer(msgpack_object* d, int64_t v) { + if (v < 0) { + d->type = MSGPACK_OBJECT_NEGATIVE_INTEGER; + d->via.i64 = v; + } + else { + d->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + d->via.u64 = v; + } +} + +void msgpack_object_init_float32(msgpack_object* d, float v) { + d->type = MSGPACK_OBJECT_FLOAT32; + d->via.f64 = v; +} + +void msgpack_object_init_float64(msgpack_object* d, double v) { + d->type = MSGPACK_OBJECT_FLOAT64; + d->via.f64 = v; +} + +void msgpack_object_init_str(msgpack_object* d, const char* data, uint32_t size) { + d->type = MSGPACK_OBJECT_STR; + d->via.str.ptr = data; + d->via.str.size = size; +} + +void msgpack_object_init_bin(msgpack_object* d, const char* data, uint32_t size) { + d->type = MSGPACK_OBJECT_BIN; + d->via.bin.ptr = data; + d->via.bin.size = size; +} + +void msgpack_object_init_ext(msgpack_object* d, int8_t type, const char* data, uint32_t size) { + d->type = MSGPACK_OBJECT_EXT; + d->via.ext.type = type; + d->via.ext.ptr = data; + d->via.ext.size = size; +} + +void 
msgpack_object_init_array(msgpack_object* d, msgpack_object* data, uint32_t size) { + d->type = MSGPACK_OBJECT_ARRAY; + d->via.array.ptr = data; + d->via.array.size = size; +} + +void msgpack_object_init_map(msgpack_object* d, msgpack_object_kv* data, uint32_t size) { + d->type = MSGPACK_OBJECT_MAP; + d->via.map.ptr = data; + d->via.map.size = size; +} + +#if !defined(_KERNEL_MODE) + +static void msgpack_object_bin_print(FILE* out, const char *ptr, size_t size) +{ + size_t i; + for (i = 0; i < size; ++i) { + if (ptr[i] == '"') { + fputs("\\\"", out); + } else if (isprint((unsigned char)ptr[i])) { + fputc(ptr[i], out); + } else { + fprintf(out, "\\x%02x", (unsigned char)ptr[i]); + } + } +} + +void msgpack_object_print(FILE* out, msgpack_object o) +{ + switch(o.type) { + case MSGPACK_OBJECT_NIL: + fprintf(out, "nil"); + break; + + case MSGPACK_OBJECT_BOOLEAN: + fprintf(out, (o.via.boolean ? "true" : "false")); + break; + + case MSGPACK_OBJECT_POSITIVE_INTEGER: +#if defined(PRIu64) + fprintf(out, "%" PRIu64, o.via.u64); +#else + if (o.via.u64 > ULONG_MAX) + fprintf(out, "over 4294967295"); + else + fprintf(out, "%lu", (unsigned long)o.via.u64); +#endif + break; + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: +#if defined(PRIi64) + fprintf(out, "%" PRIi64, o.via.i64); +#else + if (o.via.i64 > LONG_MAX) + fprintf(out, "over +2147483647"); + else if (o.via.i64 < LONG_MIN) + fprintf(out, "under -2147483648"); + else + fprintf(out, "%ld", (signed long)o.via.i64); +#endif + break; + + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: + fprintf(out, "%f", o.via.f64); + break; + + case MSGPACK_OBJECT_STR: + fprintf(out, "\""); + fwrite(o.via.str.ptr, o.via.str.size, 1, out); + fprintf(out, "\""); + break; + + case MSGPACK_OBJECT_BIN: + fprintf(out, "\""); + msgpack_object_bin_print(out, o.via.bin.ptr, o.via.bin.size); + fprintf(out, "\""); + break; + + case MSGPACK_OBJECT_EXT: +#if defined(PRIi8) + fprintf(out, "(ext: %" PRIi8 ")", o.via.ext.type); +#else + fprintf(out, 
"(ext: %d)", (int)o.via.ext.type); +#endif + fprintf(out, "\""); + msgpack_object_bin_print(out, o.via.ext.ptr, o.via.ext.size); + fprintf(out, "\""); + break; + + case MSGPACK_OBJECT_ARRAY: + fprintf(out, "["); + if(o.via.array.size != 0) { + msgpack_object* p = o.via.array.ptr; + msgpack_object* const pend = o.via.array.ptr + o.via.array.size; + msgpack_object_print(out, *p); + ++p; + for(; p < pend; ++p) { + fprintf(out, ", "); + msgpack_object_print(out, *p); + } + } + fprintf(out, "]"); + break; + + case MSGPACK_OBJECT_MAP: + fprintf(out, "{"); + if(o.via.map.size != 0) { + msgpack_object_kv* p = o.via.map.ptr; + msgpack_object_kv* const pend = o.via.map.ptr + o.via.map.size; + msgpack_object_print(out, p->key); + fprintf(out, "=>"); + msgpack_object_print(out, p->val); + ++p; + for(; p < pend; ++p) { + fprintf(out, ", "); + msgpack_object_print(out, p->key); + fprintf(out, "=>"); + msgpack_object_print(out, p->val); + } + } + fprintf(out, "}"); + break; + + default: + // FIXME +#if defined(PRIu64) + fprintf(out, "#", o.type, o.via.u64); +#else + if (o.via.u64 > ULONG_MAX) + fprintf(out, "#", o.type); + else + fprintf(out, "#", o.type, (unsigned long)o.via.u64); +#endif + + } +} + +#endif + +#define MSGPACK_CHECKED_CALL(ret, func, aux_buffer, aux_buffer_size, ...) 
\ + ret = func(aux_buffer, aux_buffer_size, __VA_ARGS__); \ + if (ret <= 0 || ret >= (int)aux_buffer_size) return 0; \ + aux_buffer = aux_buffer + ret; \ + aux_buffer_size = aux_buffer_size - ret \ + +static int msgpack_object_bin_print_buffer(char *buffer, size_t buffer_size, const char *ptr, size_t size) +{ + size_t i; + char *aux_buffer = buffer; + size_t aux_buffer_size = buffer_size; + int ret; + + for (i = 0; i < size; ++i) { + if (ptr[i] == '"') { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "\\\""); + } else if (isprint((unsigned char)ptr[i])) { + if (aux_buffer_size > 0) { + memcpy(aux_buffer, ptr + i, 1); + aux_buffer = aux_buffer + 1; + aux_buffer_size = aux_buffer_size - 1; + } + } else { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "\\x%02x", (unsigned char)ptr[i]); + } + } + + return (int)(buffer_size - aux_buffer_size); +} + +int msgpack_object_print_buffer(char *buffer, size_t buffer_size, msgpack_object o) +{ + char *aux_buffer = buffer; + size_t aux_buffer_size = buffer_size; + int ret; + switch(o.type) { + case MSGPACK_OBJECT_NIL: + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "nil"); + break; + + case MSGPACK_OBJECT_BOOLEAN: + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, (o.via.boolean ? 
"true" : "false")); + break; + + case MSGPACK_OBJECT_POSITIVE_INTEGER: +#if defined(PRIu64) + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "%" PRIu64, o.via.u64); +#else + if (o.via.u64 > ULONG_MAX) { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "over 4294967295"); + } else { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "%lu", (unsigned long)o.via.u64); + } +#endif + break; + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: +#if defined(PRIi64) + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "%" PRIi64, o.via.i64); +#else + if (o.via.i64 > LONG_MAX) { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "over +2147483647"); + } else if (o.via.i64 < LONG_MIN) { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "under -2147483648"); + } else { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "%ld", (signed long)o.via.i64); + } +#endif + break; + + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "%f", o.via.f64); + break; + + case MSGPACK_OBJECT_STR: + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "\""); + if (o.via.str.size > 0) { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "%.*s", (int)o.via.str.size, o.via.str.ptr); + } + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "\""); + break; + + case MSGPACK_OBJECT_BIN: + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "\""); + MSGPACK_CHECKED_CALL(ret, msgpack_object_bin_print_buffer, aux_buffer, aux_buffer_size, o.via.bin.ptr, o.via.bin.size); + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "\""); + break; + + case MSGPACK_OBJECT_EXT: +#if defined(PRIi8) + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "(ext: %" PRIi8 ")", o.via.ext.type); +#else + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, 
aux_buffer_size, "(ext: %d)", (int)o.via.ext.type); +#endif + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "\""); + MSGPACK_CHECKED_CALL(ret, msgpack_object_bin_print_buffer, aux_buffer, aux_buffer_size, o.via.ext.ptr, o.via.ext.size); + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "\""); + break; + + case MSGPACK_OBJECT_ARRAY: + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "["); + if(o.via.array.size != 0) { + msgpack_object* p = o.via.array.ptr; + msgpack_object* const pend = o.via.array.ptr + o.via.array.size; + MSGPACK_CHECKED_CALL(ret, msgpack_object_print_buffer, aux_buffer, aux_buffer_size, *p); + ++p; + for(; p < pend; ++p) { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, ", "); + MSGPACK_CHECKED_CALL(ret, msgpack_object_print_buffer, aux_buffer, aux_buffer_size, *p); + } + } + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "]"); + break; + + case MSGPACK_OBJECT_MAP: + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "{"); + if(o.via.map.size != 0) { + msgpack_object_kv* p = o.via.map.ptr; + msgpack_object_kv* const pend = o.via.map.ptr + o.via.map.size; + MSGPACK_CHECKED_CALL(ret, msgpack_object_print_buffer, aux_buffer, aux_buffer_size, p->key); + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "=>"); + MSGPACK_CHECKED_CALL(ret, msgpack_object_print_buffer, aux_buffer, aux_buffer_size, p->val); + ++p; + for(; p < pend; ++p) { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, ", "); + MSGPACK_CHECKED_CALL(ret, msgpack_object_print_buffer, aux_buffer, aux_buffer_size, p->key); + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "=>"); + MSGPACK_CHECKED_CALL(ret, msgpack_object_print_buffer, aux_buffer, aux_buffer_size, p->val); + } + } + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "}"); + break; + + default: + // FIXME +#if defined(PRIu64) + MSGPACK_CHECKED_CALL(ret, snprintf, 
aux_buffer, aux_buffer_size, "#", o.type, o.via.u64); +#else + if (o.via.u64 > ULONG_MAX) { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "#", o.type); + } else { + MSGPACK_CHECKED_CALL(ret, snprintf, aux_buffer, aux_buffer_size, "#", o.type, (unsigned long)o.via.u64); + } +#endif + } + + return (int)(buffer_size - aux_buffer_size); +} + +#undef MSGPACK_CHECKED_CALL + +bool msgpack_object_equal(const msgpack_object x, const msgpack_object y) +{ + if(x.type != y.type) { return false; } + + switch(x.type) { + case MSGPACK_OBJECT_NIL: + return true; + + case MSGPACK_OBJECT_BOOLEAN: + return x.via.boolean == y.via.boolean; + + case MSGPACK_OBJECT_POSITIVE_INTEGER: + return x.via.u64 == y.via.u64; + + case MSGPACK_OBJECT_NEGATIVE_INTEGER: + return x.via.i64 == y.via.i64; + + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: + return x.via.f64 == y.via.f64; + + case MSGPACK_OBJECT_STR: + return x.via.str.size == y.via.str.size && + memcmp(x.via.str.ptr, y.via.str.ptr, x.via.str.size) == 0; + + case MSGPACK_OBJECT_BIN: + return x.via.bin.size == y.via.bin.size && + memcmp(x.via.bin.ptr, y.via.bin.ptr, x.via.bin.size) == 0; + + case MSGPACK_OBJECT_EXT: + return x.via.ext.size == y.via.ext.size && + x.via.ext.type == y.via.ext.type && + memcmp(x.via.ext.ptr, y.via.ext.ptr, x.via.ext.size) == 0; + + case MSGPACK_OBJECT_ARRAY: + if(x.via.array.size != y.via.array.size) { + return false; + } else if(x.via.array.size == 0) { + return true; + } else { + msgpack_object* px = x.via.array.ptr; + msgpack_object* const pxend = x.via.array.ptr + x.via.array.size; + msgpack_object* py = y.via.array.ptr; + do { + if(!msgpack_object_equal(*px, *py)) { + return false; + } + ++px; + ++py; + } while(px < pxend); + return true; + } + + case MSGPACK_OBJECT_MAP: + if(x.via.map.size != y.via.map.size) { + return false; + } else if(x.via.map.size == 0) { + return true; + } else { + msgpack_object_kv* px = x.via.map.ptr; + msgpack_object_kv* const pxend = 
x.via.map.ptr + x.via.map.size; + msgpack_object_kv* py = y.via.map.ptr; + do { + if(!msgpack_object_equal(px->key, py->key) || !msgpack_object_equal(px->val, py->val)) { + return false; + } + ++px; + ++py; + } while(px < pxend); + return true; + } + + default: + return false; + } +} diff --git a/c/third_party/msgpack/src/unpack.c b/c/third_party/msgpack/src/unpack.c new file mode 100644 index 0000000..9341cb0 --- /dev/null +++ b/c/third_party/msgpack/src/unpack.c @@ -0,0 +1,702 @@ +/* + * MessagePack for C unpacking routine + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +#include "msgpack/unpack.h" +#include "msgpack/unpack_define.h" +#include "msgpack/util.h" +#include + +#ifdef _msgpack_atomic_counter_header +#include _msgpack_atomic_counter_header +#endif + + +typedef struct { + msgpack_zone** z; + bool referenced; +} unpack_user; + + +#define msgpack_unpack_struct(name) \ + struct template ## name + +#define msgpack_unpack_func(ret, name) \ + ret template ## name + +#define msgpack_unpack_callback(name) \ + template_callback ## name + +#define msgpack_unpack_object msgpack_object + +#define msgpack_unpack_user unpack_user + + +struct template_context; +typedef struct template_context template_context; + +static void template_init(template_context* ctx); + +static msgpack_object template_data(template_context* ctx); + +static int template_execute( + template_context* ctx, const char* data, size_t len, size_t* off); + + +static inline msgpack_object template_callback_root(unpack_user* u) +{ + msgpack_object o; + MSGPACK_UNUSED(u); + o.type = MSGPACK_OBJECT_NIL; + return o; +} + +static inline int template_callback_uint8(unpack_user* u, uint8_t d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + o->via.u64 = d; + return 0; +} + +static inline int 
template_callback_uint16(unpack_user* u, uint16_t d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + o->via.u64 = d; + return 0; +} + +static inline int template_callback_uint32(unpack_user* u, uint32_t d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + o->via.u64 = d; + return 0; +} + +static inline int template_callback_uint64(unpack_user* u, uint64_t d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + o->via.u64 = d; + return 0; +} + +static inline int template_callback_int8(unpack_user* u, int8_t d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + if(d >= 0) { + o->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + o->via.u64 = (uint64_t)d; + return 0; + } + else { + o->type = MSGPACK_OBJECT_NEGATIVE_INTEGER; + o->via.i64 = d; + return 0; + } +} + +static inline int template_callback_int16(unpack_user* u, int16_t d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + if(d >= 0) { + o->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + o->via.u64 = (uint64_t)d; + return 0; + } + else { + o->type = MSGPACK_OBJECT_NEGATIVE_INTEGER; + o->via.i64 = d; + return 0; + } +} + +static inline int template_callback_int32(unpack_user* u, int32_t d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + if(d >= 0) { + o->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + o->via.u64 = (uint64_t)d; + return 0; + } + else { + o->type = MSGPACK_OBJECT_NEGATIVE_INTEGER; + o->via.i64 = d; + return 0; + } +} + +static inline int template_callback_int64(unpack_user* u, int64_t d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + if(d >= 0) { + o->type = MSGPACK_OBJECT_POSITIVE_INTEGER; + o->via.u64 = (uint64_t)d; + return 0; + } + else { + o->type = MSGPACK_OBJECT_NEGATIVE_INTEGER; + o->via.i64 = d; + return 0; + } +} + +static inline int template_callback_float(unpack_user* u, float d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_FLOAT32; + o->via.f64 = d; + return 0; +} + +static 
inline int template_callback_double(unpack_user* u, double d, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_FLOAT64; + o->via.f64 = d; + return 0; +} + +static inline int template_callback_nil(unpack_user* u, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_NIL; + return 0; +} + +static inline int template_callback_true(unpack_user* u, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_BOOLEAN; + o->via.boolean = true; + return 0; +} + +static inline int template_callback_false(unpack_user* u, msgpack_object* o) +{ + MSGPACK_UNUSED(u); + o->type = MSGPACK_OBJECT_BOOLEAN; + o->via.boolean = false; + return 0; +} + +static inline int template_callback_array(unpack_user* u, unsigned int n, msgpack_object* o) +{ + size_t size; + // Let's leverage the fact that sizeof(msgpack_object) is a compile time constant + // to check for int overflows. + // Note - while n is constrained to 32-bit, the product of n * sizeof(msgpack_object) + // might not be constrained to 4GB on 64-bit systems +#if SIZE_MAX == UINT_MAX + if (n > SIZE_MAX/sizeof(msgpack_object)) + return MSGPACK_UNPACK_NOMEM_ERROR; +#endif + + o->type = MSGPACK_OBJECT_ARRAY; + o->via.array.size = 0; + + size = n * sizeof(msgpack_object); + + if (*u->z == NULL) { + *u->z = msgpack_zone_new(MSGPACK_ZONE_CHUNK_SIZE); + if(*u->z == NULL) { + return MSGPACK_UNPACK_NOMEM_ERROR; + } + } + + // Unsure whether size = 0 should be an error, and if so, what to return + o->via.array.ptr = (msgpack_object*)msgpack_zone_malloc(*u->z, size); + if(o->via.array.ptr == NULL) { return MSGPACK_UNPACK_NOMEM_ERROR; } + return 0; +} + +static inline int template_callback_array_item(unpack_user* u, msgpack_object* c, msgpack_object o) +{ + MSGPACK_UNUSED(u); +#if defined(__GNUC__) && !defined(__clang__) + memcpy(&c->via.array.ptr[c->via.array.size], &o, sizeof(msgpack_object)); +#else /* __GNUC__ && !__clang__ */ + c->via.array.ptr[c->via.array.size] = o; +#endif /* __GNUC__ 
&& !__clang__ */ + ++c->via.array.size; + return 0; +} + +static inline int template_callback_map(unpack_user* u, unsigned int n, msgpack_object* o) +{ + size_t size; + // Let's leverage the fact that sizeof(msgpack_object_kv) is a compile time constant + // to check for int overflows + // Note - while n is constrained to 32-bit, the product of n * sizeof(msgpack_object) + // might not be constrained to 4GB on 64-bit systems + + // Note - this will always be false on 64-bit systems +#if SIZE_MAX == UINT_MAX + if (n > SIZE_MAX/sizeof(msgpack_object_kv)) + return MSGPACK_UNPACK_NOMEM_ERROR; +#endif + + o->type = MSGPACK_OBJECT_MAP; + o->via.map.size = 0; + + size = n * sizeof(msgpack_object_kv); + + if (*u->z == NULL) { + *u->z = msgpack_zone_new(MSGPACK_ZONE_CHUNK_SIZE); + if(*u->z == NULL) { + return MSGPACK_UNPACK_NOMEM_ERROR; + } + } + + // Should size = 0 be an error? If so, what error to return? + o->via.map.ptr = (msgpack_object_kv*)msgpack_zone_malloc(*u->z, size); + if(o->via.map.ptr == NULL) { return MSGPACK_UNPACK_NOMEM_ERROR; } + return 0; +} + +static inline int template_callback_map_item(unpack_user* u, msgpack_object* c, msgpack_object k, msgpack_object v) +{ + MSGPACK_UNUSED(u); +#if defined(__GNUC__) && !defined(__clang__) + memcpy(&c->via.map.ptr[c->via.map.size].key, &k, sizeof(msgpack_object)); + memcpy(&c->via.map.ptr[c->via.map.size].val, &v, sizeof(msgpack_object)); +#else /* __GNUC__ && !__clang__ */ + c->via.map.ptr[c->via.map.size].key = k; + c->via.map.ptr[c->via.map.size].val = v; +#endif /* __GNUC__ && !__clang__ */ + ++c->via.map.size; + return 0; +} + +static inline int template_callback_str(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_object* o) +{ + MSGPACK_UNUSED(b); + if (*u->z == NULL) { + *u->z = msgpack_zone_new(MSGPACK_ZONE_CHUNK_SIZE); + if(*u->z == NULL) { + return MSGPACK_UNPACK_NOMEM_ERROR; + } + } + o->type = MSGPACK_OBJECT_STR; + o->via.str.ptr = p; + o->via.str.size = l; + u->referenced = true; + 
return 0; +} + +static inline int template_callback_bin(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_object* o) +{ + MSGPACK_UNUSED(b); + if (*u->z == NULL) { + *u->z = msgpack_zone_new(MSGPACK_ZONE_CHUNK_SIZE); + if(*u->z == NULL) { + return MSGPACK_UNPACK_NOMEM_ERROR; + } + } + o->type = MSGPACK_OBJECT_BIN; + o->via.bin.ptr = p; + o->via.bin.size = l; + u->referenced = true; + return 0; +} + +static inline int template_callback_ext(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_object* o) +{ + MSGPACK_UNUSED(b); + if (l == 0) { + return MSGPACK_UNPACK_PARSE_ERROR; + } + if (*u->z == NULL) { + *u->z = msgpack_zone_new(MSGPACK_ZONE_CHUNK_SIZE); + if(*u->z == NULL) { + return MSGPACK_UNPACK_NOMEM_ERROR; + } + } + o->type = MSGPACK_OBJECT_EXT; + o->via.ext.type = *p; + o->via.ext.ptr = p + 1; + o->via.ext.size = l - 1; + u->referenced = true; + return 0; +} + +#include "msgpack/unpack_template.h" + + +#define CTX_CAST(m) ((template_context*)(m)) +#define CTX_REFERENCED(mpac) CTX_CAST((mpac)->ctx)->user.referenced + +#define COUNTER_SIZE (sizeof(_msgpack_atomic_counter_t)) + + +static inline void init_count(void* buffer) +{ + *(volatile _msgpack_atomic_counter_t*)buffer = 1; +} + +static inline void decr_count(void* buffer) +{ + // atomic if(--*(_msgpack_atomic_counter_t*)buffer == 0) { free(buffer); } + if(_msgpack_sync_decr_and_fetch((volatile _msgpack_atomic_counter_t*)buffer) == 0) { + free(buffer); + } +} + +static inline void incr_count(void* buffer) +{ + // atomic ++*(_msgpack_atomic_counter_t*)buffer; + _msgpack_sync_incr_and_fetch((volatile _msgpack_atomic_counter_t*)buffer); +} + +static inline _msgpack_atomic_counter_t get_count(void* buffer) +{ + return *(volatile _msgpack_atomic_counter_t*)buffer; +} + +bool msgpack_unpacker_init(msgpack_unpacker* mpac, size_t initial_buffer_size) +{ + char* buffer; + void* ctx; + + if(initial_buffer_size < COUNTER_SIZE) { + initial_buffer_size = COUNTER_SIZE; + } + + buffer 
= (char*)malloc(initial_buffer_size); + if(buffer == NULL) { + return false; + } + + ctx = malloc(sizeof(template_context)); + if(ctx == NULL) { + free(buffer); + return false; + } + + mpac->buffer = buffer; + mpac->used = COUNTER_SIZE; + mpac->free = initial_buffer_size - mpac->used; + mpac->off = COUNTER_SIZE; + mpac->parsed = 0; + mpac->initial_buffer_size = initial_buffer_size; + mpac->z = NULL; + mpac->ctx = ctx; + + init_count(mpac->buffer); + + template_init(CTX_CAST(mpac->ctx)); + CTX_CAST(mpac->ctx)->user.z = &mpac->z; + CTX_CAST(mpac->ctx)->user.referenced = false; + + return true; +} + +void msgpack_unpacker_destroy(msgpack_unpacker* mpac) +{ + msgpack_zone_free(mpac->z); + free(mpac->ctx); + decr_count(mpac->buffer); +} + +msgpack_unpacker* msgpack_unpacker_new(size_t initial_buffer_size) +{ + msgpack_unpacker* mpac = (msgpack_unpacker*)malloc(sizeof(msgpack_unpacker)); + if(mpac == NULL) { + return NULL; + } + + if(!msgpack_unpacker_init(mpac, initial_buffer_size)) { + free(mpac); + return NULL; + } + + return mpac; +} + +void msgpack_unpacker_free(msgpack_unpacker* mpac) +{ + msgpack_unpacker_destroy(mpac); + free(mpac); +} + +bool msgpack_unpacker_expand_buffer(msgpack_unpacker* mpac, size_t size) +{ + if(mpac->used == mpac->off && get_count(mpac->buffer) == 1 + && !CTX_REFERENCED(mpac)) { + // rewind buffer + mpac->free += mpac->used - COUNTER_SIZE; + mpac->used = COUNTER_SIZE; + mpac->off = COUNTER_SIZE; + + if(mpac->free >= size) { + return true; + } + } + + if(mpac->off == COUNTER_SIZE) { + char* tmp; + size_t next_size = (mpac->used + mpac->free) * 2; // include COUNTER_SIZE + while(next_size < size + mpac->used) { + size_t tmp_next_size = next_size * 2; + if (tmp_next_size <= next_size) { + next_size = size + mpac->used; + break; + } + next_size = tmp_next_size; + } + + tmp = (char*)realloc(mpac->buffer, next_size); + if(tmp == NULL) { + return false; + } + + mpac->buffer = tmp; + mpac->free = next_size - mpac->used; + + } else { + char* tmp; + 
size_t next_size = mpac->initial_buffer_size; // include COUNTER_SIZE + size_t not_parsed = mpac->used - mpac->off; + while(next_size < size + not_parsed + COUNTER_SIZE) { + size_t tmp_next_size = next_size * 2; + if (tmp_next_size <= next_size) { + next_size = size + not_parsed + COUNTER_SIZE; + break; + } + next_size = tmp_next_size; + } + + tmp = (char*)malloc(next_size); + if(tmp == NULL) { + return false; + } + + init_count(tmp); + + memcpy(tmp+COUNTER_SIZE, mpac->buffer+mpac->off, not_parsed); + + if(CTX_REFERENCED(mpac)) { + if(!msgpack_zone_push_finalizer(mpac->z, decr_count, mpac->buffer)) { + free(tmp); + return false; + } + CTX_REFERENCED(mpac) = false; + } else { + decr_count(mpac->buffer); + } + + mpac->buffer = tmp; + mpac->used = not_parsed + COUNTER_SIZE; + mpac->free = next_size - mpac->used; + mpac->off = COUNTER_SIZE; + } + + return true; +} + +int msgpack_unpacker_execute(msgpack_unpacker* mpac) +{ + size_t off = mpac->off; + int ret = template_execute(CTX_CAST(mpac->ctx), + mpac->buffer, mpac->used, &mpac->off); + if(mpac->off > off) { + mpac->parsed += mpac->off - off; + } + return ret; +} + +msgpack_object msgpack_unpacker_data(msgpack_unpacker* mpac) +{ + return template_data(CTX_CAST(mpac->ctx)); +} + +msgpack_zone* msgpack_unpacker_release_zone(msgpack_unpacker* mpac) +{ + msgpack_zone* old = mpac->z; + + if (old == NULL) return NULL; + if(!msgpack_unpacker_flush_zone(mpac)) { + return NULL; + } + + mpac->z = NULL; + CTX_CAST(mpac->ctx)->user.z = &mpac->z; + + return old; +} + +void msgpack_unpacker_reset_zone(msgpack_unpacker* mpac) +{ + msgpack_zone_clear(mpac->z); +} + +bool msgpack_unpacker_flush_zone(msgpack_unpacker* mpac) +{ + if(CTX_REFERENCED(mpac)) { + if(!msgpack_zone_push_finalizer(mpac->z, decr_count, mpac->buffer)) { + return false; + } + CTX_REFERENCED(mpac) = false; + + incr_count(mpac->buffer); + } + + return true; +} + +void msgpack_unpacker_reset(msgpack_unpacker* mpac) +{ + template_init(CTX_CAST(mpac->ctx)); + // don't 
reset referenced flag + mpac->parsed = 0; +} + +static inline msgpack_unpack_return unpacker_next(msgpack_unpacker* mpac, + msgpack_unpacked* result) +{ + int ret; + + msgpack_unpacked_destroy(result); + + ret = msgpack_unpacker_execute(mpac); + + if(ret < 0) { + result->zone = NULL; + memset(&result->data, 0, sizeof(msgpack_object)); + return (msgpack_unpack_return)ret; + } + + if(ret == 0) { + return MSGPACK_UNPACK_CONTINUE; + } + result->zone = msgpack_unpacker_release_zone(mpac); + result->data = msgpack_unpacker_data(mpac); + + return MSGPACK_UNPACK_SUCCESS; +} + +msgpack_unpack_return msgpack_unpacker_next(msgpack_unpacker* mpac, + msgpack_unpacked* result) +{ + msgpack_unpack_return ret; + + ret = unpacker_next(mpac, result); + if (ret == MSGPACK_UNPACK_SUCCESS) { + msgpack_unpacker_reset(mpac); + } + + return ret; +} + +msgpack_unpack_return +msgpack_unpacker_next_with_size(msgpack_unpacker* mpac, + msgpack_unpacked* result, size_t *p_bytes) +{ + msgpack_unpack_return ret; + + ret = unpacker_next(mpac, result); + if (ret == MSGPACK_UNPACK_SUCCESS || ret == MSGPACK_UNPACK_CONTINUE) { + *p_bytes = mpac->parsed; + } + + if (ret == MSGPACK_UNPACK_SUCCESS) { + msgpack_unpacker_reset(mpac); + } + + return ret; +} + +msgpack_unpack_return +msgpack_unpack(const char* data, size_t len, size_t* off, + msgpack_zone* result_zone, msgpack_object* result) +{ + size_t noff = 0; + if(off != NULL) { noff = *off; } + + if(len <= noff) { + // FIXME + return MSGPACK_UNPACK_CONTINUE; + } + else { + int e; + template_context ctx; + template_init(&ctx); + + ctx.user.z = &result_zone; + ctx.user.referenced = false; + + e = template_execute(&ctx, data, len, &noff); + if(e < 0) { + return (msgpack_unpack_return)e; + } + + if(off != NULL) { *off = noff; } + + if(e == 0) { + return MSGPACK_UNPACK_CONTINUE; + } + + *result = template_data(&ctx); + + if(noff < len) { + return MSGPACK_UNPACK_EXTRA_BYTES; + } + + return MSGPACK_UNPACK_SUCCESS; + } +} + +msgpack_unpack_return 
+msgpack_unpack_next(msgpack_unpacked* result, + const char* data, size_t len, size_t* off) +{ + size_t noff = 0; + msgpack_unpacked_destroy(result); + + if(off != NULL) { noff = *off; } + + if(len <= noff) { + return MSGPACK_UNPACK_CONTINUE; + } + + { + int e; + template_context ctx; + template_init(&ctx); + + ctx.user.z = &result->zone; + ctx.user.referenced = false; + + e = template_execute(&ctx, data, len, &noff); + + if(off != NULL) { *off = noff; } + + if(e < 0) { + msgpack_zone_free(result->zone); + result->zone = NULL; + return (msgpack_unpack_return)e; + } + + if(e == 0) { + return MSGPACK_UNPACK_CONTINUE; + } + + result->data = template_data(&ctx); + + return MSGPACK_UNPACK_SUCCESS; + } +} + +#if defined(MSGPACK_OLD_COMPILER_BUS_ERROR_WORKAROUND) +// FIXME: Dirty hack to avoid a bus error caused by OS X's old gcc. +static void dummy_function_to_avoid_bus_error() +{ +} +#endif diff --git a/c/third_party/msgpack/src/version.c b/c/third_party/msgpack/src/version.c new file mode 100644 index 0000000..83f7510 --- /dev/null +++ b/c/third_party/msgpack/src/version.c @@ -0,0 +1,22 @@ +#include "msgpack.h" + +const char* msgpack_version(void) +{ + return MSGPACK_VERSION; +} + +int msgpack_version_major(void) +{ + return MSGPACK_VERSION_MAJOR; +} + +int msgpack_version_minor(void) +{ + return MSGPACK_VERSION_MINOR; +} + +int msgpack_version_revision(void) +{ + return MSGPACK_VERSION_REVISION; +} + diff --git a/c/third_party/msgpack/src/vrefbuffer.c b/c/third_party/msgpack/src/vrefbuffer.c new file mode 100644 index 0000000..d14b719 --- /dev/null +++ b/c/third_party/msgpack/src/vrefbuffer.c @@ -0,0 +1,250 @@ +/* + * MessagePack for C zero-copy buffer implementation + * + * Copyright (C) 2008-2009 FURUHASHI Sadayuki + * + * Distributed under the Boost Software License, Version 1.0. 
+ *    (See accompanying file LICENSE_1_0.txt or copy at
+ *    http://www.boost.org/LICENSE_1_0.txt)
+ */
+#include "msgpack/vrefbuffer.h"
+#include <stdlib.h>
+#include <string.h>
+
+#define MSGPACK_PACKER_MAX_BUFFER_SIZE 9
+
+struct msgpack_vrefbuffer_chunk {
+    struct msgpack_vrefbuffer_chunk* next;
+    /* data ... */
+};
+
+bool msgpack_vrefbuffer_init(msgpack_vrefbuffer* vbuf,
+        size_t ref_size, size_t chunk_size)
+{
+    size_t nfirst;
+    msgpack_iovec* array;
+    msgpack_vrefbuffer_chunk* chunk;
+
+    if (ref_size == 0) {
+        ref_size = MSGPACK_VREFBUFFER_REF_SIZE;
+    }
+    if(chunk_size == 0) {
+        chunk_size = MSGPACK_VREFBUFFER_CHUNK_SIZE;
+    }
+    vbuf->chunk_size = chunk_size;
+    vbuf->ref_size =
+        ref_size > MSGPACK_PACKER_MAX_BUFFER_SIZE + 1 ?
+        ref_size : MSGPACK_PACKER_MAX_BUFFER_SIZE + 1 ;
+
+    if((sizeof(msgpack_vrefbuffer_chunk) + chunk_size) < chunk_size) {
+        return false;
+    }
+
+    nfirst = (sizeof(msgpack_iovec) < 72/2) ?
+            72 / sizeof(msgpack_iovec) : 8;
+
+    array = (msgpack_iovec*)malloc(
+            sizeof(msgpack_iovec) * nfirst);
+    if(array == NULL) {
+        return false;
+    }
+
+    vbuf->tail  = array;
+    vbuf->end   = array + nfirst;
+    vbuf->array = array;
+
+    chunk = (msgpack_vrefbuffer_chunk*)malloc(
+            sizeof(msgpack_vrefbuffer_chunk) + chunk_size);
+    if(chunk == NULL) {
+        free(array);
+        return false;
+    }
+    else {
+        msgpack_vrefbuffer_inner_buffer* const ib = &vbuf->inner_buffer;
+
+        ib->free = chunk_size;
+        ib->ptr  = ((char*)chunk) + sizeof(msgpack_vrefbuffer_chunk);
+        ib->head = chunk;
+        chunk->next = NULL;
+
+        return true;
+    }
+}
+
+void msgpack_vrefbuffer_destroy(msgpack_vrefbuffer* vbuf)
+{
+    msgpack_vrefbuffer_chunk* c = vbuf->inner_buffer.head;
+    while(true) {
+        msgpack_vrefbuffer_chunk* n = c->next;
+        free(c);
+        if(n != NULL) {
+            c = n;
+        } else {
+            break;
+        }
+    }
+    free(vbuf->array);
+}
+
+void msgpack_vrefbuffer_clear(msgpack_vrefbuffer* vbuf)
+{
+    msgpack_vrefbuffer_chunk* c = vbuf->inner_buffer.head->next;
+    msgpack_vrefbuffer_chunk* n;
+    while(c != NULL) {
+        n = c->next;
+        free(c);
+        c = n;
+    }
+
+    {
+        msgpack_vrefbuffer_inner_buffer* const ib = &vbuf->inner_buffer;
+        msgpack_vrefbuffer_chunk* chunk = ib->head;
+        chunk->next = NULL;
+        ib->free = vbuf->chunk_size;
+        ib->ptr  = ((char*)chunk) + sizeof(msgpack_vrefbuffer_chunk);
+
+        vbuf->tail = vbuf->array;
+    }
+}
+
+int msgpack_vrefbuffer_append_ref(msgpack_vrefbuffer* vbuf,
+        const char* buf, size_t len)
+{
+    if(vbuf->tail == vbuf->end) {
+        const size_t nused = (size_t)(vbuf->tail - vbuf->array);
+        const size_t nnext = nused * 2;
+
+        msgpack_iovec* nvec = (msgpack_iovec*)realloc(
+                vbuf->array, sizeof(msgpack_iovec)*nnext);
+        if(nvec == NULL) {
+            return -1;
+        }
+
+        vbuf->array = nvec;
+        vbuf->end   = nvec + nnext;
+        vbuf->tail  = nvec + nused;
+    }
+
+    vbuf->tail->iov_base = (char*)buf;
+    vbuf->tail->iov_len  = len;
+    ++vbuf->tail;
+
+    return 0;
+}
+
+int msgpack_vrefbuffer_append_copy(msgpack_vrefbuffer* vbuf,
+        const char* buf, size_t len)
+{
+    msgpack_vrefbuffer_inner_buffer* const ib = &vbuf->inner_buffer;
+    char* m;
+
+    if(ib->free < len) {
+        msgpack_vrefbuffer_chunk* chunk;
+        size_t sz = vbuf->chunk_size;
+        if(sz < len) {
+            sz = len;
+        }
+
+        if((sizeof(msgpack_vrefbuffer_chunk) + sz) < sz){
+            return -1;
+        }
+        chunk = (msgpack_vrefbuffer_chunk*)malloc(
+                sizeof(msgpack_vrefbuffer_chunk) + sz);
+        if(chunk == NULL) {
+            return -1;
+        }
+
+        chunk->next = ib->head;
+        ib->head = chunk;
+        ib->free = sz;
+        ib->ptr  = ((char*)chunk) + sizeof(msgpack_vrefbuffer_chunk);
+    }
+
+    m = ib->ptr;
+    memcpy(m, buf, len);
+    ib->free -= len;
+    ib->ptr  += len;
+
+    if(vbuf->tail != vbuf->array && m ==
+            (const char*)((vbuf->tail-1)->iov_base) + (vbuf->tail-1)->iov_len) {
+        (vbuf->tail-1)->iov_len += len;
+        return 0;
+    } else {
+        return msgpack_vrefbuffer_append_ref(vbuf, m, len);
+    }
+}
+
+int msgpack_vrefbuffer_migrate(msgpack_vrefbuffer* vbuf, msgpack_vrefbuffer* to)
+{
+    size_t sz = vbuf->chunk_size;
+    msgpack_vrefbuffer_chunk* empty;
+
+    if((sizeof(msgpack_vrefbuffer_chunk) + sz) < sz){
+        return -1;
+    }
+
+    empty = (msgpack_vrefbuffer_chunk*)malloc(
+            sizeof(msgpack_vrefbuffer_chunk) + sz);
+    if(empty == NULL) {
+        return -1;
+    }
+
+    empty->next = NULL;
+
+    {
+        const size_t nused = (size_t)(vbuf->tail - vbuf->array);
+        if((size_t)(to->end - to->tail) < nused) { /* grow `to` when its free iovec slots cannot hold the nused entries copied below */
+            msgpack_iovec* nvec;
+            const size_t tosize = (size_t)(to->tail - to->array);
+            const size_t reqsize = nused + tosize;
+            size_t nnext = (size_t)(to->end - to->array) * 2;
+            while(nnext < reqsize) {
+                size_t tmp_nnext = nnext * 2;
+                if (tmp_nnext <= nnext) {
+                    nnext = reqsize;
+                    break;
+                }
+                nnext = tmp_nnext;
+            }
+
+            nvec = (msgpack_iovec*)realloc(
+                    to->array, sizeof(msgpack_iovec)*nnext);
+            if(nvec == NULL) {
+                free(empty);
+                return -1;
+            }
+
+            to->array = nvec;
+            to->end   = nvec + nnext;
+            to->tail  = nvec + tosize;
+        }
+
+        memcpy(to->tail, vbuf->array, sizeof(msgpack_iovec)*nused);
+
+        to->tail += nused;
+        vbuf->tail = vbuf->array;
+
+        {
+            msgpack_vrefbuffer_inner_buffer* const ib = &vbuf->inner_buffer;
+            msgpack_vrefbuffer_inner_buffer* const toib = &to->inner_buffer;
+
+            msgpack_vrefbuffer_chunk* last = ib->head;
+            while(last->next != NULL) {
+                last = last->next;
+            }
+            last->next = toib->head;
+            toib->head = ib->head;
+
+            if(toib->free < ib->free) {
+                toib->free = ib->free;
+                toib->ptr  = ib->ptr;
+            }
+
+            ib->head = empty;
+            ib->free = sz;
+            ib->ptr  = ((char*)empty) + sizeof(msgpack_vrefbuffer_chunk);
+        }
+    }
+
+    return 0;
+}
diff --git a/c/third_party/msgpack/src/zone.c b/c/third_party/msgpack/src/zone.c
new file mode 100644
index 0000000..372a1f5
--- /dev/null
+++ b/c/third_party/msgpack/src/zone.c
@@ -0,0 +1,222 @@
+/*
+ * MessagePack for C memory pool implementation
+ *
+ * Copyright (C) 2008-2009 FURUHASHI Sadayuki
+ *
+ *    Distributed under the Boost Software License, Version 1.0.
+ *    (See accompanying file LICENSE_1_0.txt or copy at
+ *    http://www.boost.org/LICENSE_1_0.txt)
+ */
+#include "msgpack/zone.h"
+#include <stdlib.h>
+#include <string.h>
+
+struct msgpack_zone_chunk {
+    struct msgpack_zone_chunk* next;
+    /* data ...
*/ +}; + +static inline bool init_chunk_list(msgpack_zone_chunk_list* cl, size_t chunk_size) +{ + msgpack_zone_chunk* chunk = (msgpack_zone_chunk*)malloc( + sizeof(msgpack_zone_chunk) + chunk_size); + if(chunk == NULL) { + return false; + } + + cl->head = chunk; + cl->free = chunk_size; + cl->ptr = ((char*)chunk) + sizeof(msgpack_zone_chunk); + chunk->next = NULL; + + return true; +} + +static inline void destroy_chunk_list(msgpack_zone_chunk_list* cl) +{ + msgpack_zone_chunk* c = cl->head; + while(true) { + msgpack_zone_chunk* n = c->next; + free(c); + if(n != NULL) { + c = n; + } else { + break; + } + } +} + +static inline void clear_chunk_list(msgpack_zone_chunk_list* cl, size_t chunk_size) +{ + msgpack_zone_chunk* c = cl->head; + while(true) { + msgpack_zone_chunk* n = c->next; + if(n != NULL) { + free(c); + c = n; + } else { + cl->head = c; + break; + } + } + cl->head->next = NULL; + cl->free = chunk_size; + cl->ptr = ((char*)cl->head) + sizeof(msgpack_zone_chunk); +} + +void* msgpack_zone_malloc_expand(msgpack_zone* zone, size_t size) +{ + msgpack_zone_chunk_list* const cl = &zone->chunk_list; + msgpack_zone_chunk* chunk; + + size_t sz = zone->chunk_size; + + while(sz < size) { + size_t tmp_sz = sz * 2; + if (tmp_sz <= sz) { + sz = size; + break; + } + sz = tmp_sz; + } + + chunk = (msgpack_zone_chunk*)malloc( + sizeof(msgpack_zone_chunk) + sz); + if (chunk == NULL) { + return NULL; + } + else { + char* ptr = ((char*)chunk) + sizeof(msgpack_zone_chunk); + chunk->next = cl->head; + cl->head = chunk; + cl->free = sz - size; + cl->ptr = ptr + size; + + return ptr; + } +} + + +static inline void init_finalizer_array(msgpack_zone_finalizer_array* fa) +{ + fa->tail = NULL; + fa->end = NULL; + fa->array = NULL; +} + +static inline void call_finalizer_array(msgpack_zone_finalizer_array* fa) +{ + msgpack_zone_finalizer* fin = fa->tail; + for(; fin != fa->array; --fin) { + (*(fin-1)->func)((fin-1)->data); + } +} + +static inline void 
destroy_finalizer_array(msgpack_zone_finalizer_array* fa) +{ + call_finalizer_array(fa); + free(fa->array); +} + +static inline void clear_finalizer_array(msgpack_zone_finalizer_array* fa) +{ + call_finalizer_array(fa); + fa->tail = fa->array; +} + +bool msgpack_zone_push_finalizer_expand(msgpack_zone* zone, + void (*func)(void* data), void* data) +{ + msgpack_zone_finalizer_array* const fa = &zone->finalizer_array; + msgpack_zone_finalizer* tmp; + + const size_t nused = (size_t)(fa->end - fa->array); + + size_t nnext; + if(nused == 0) { + nnext = (sizeof(msgpack_zone_finalizer) < 72/2) ? + 72 / sizeof(msgpack_zone_finalizer) : 8; + + } else { + nnext = nused * 2; + } + + tmp = (msgpack_zone_finalizer*)realloc(fa->array, + sizeof(msgpack_zone_finalizer) * nnext); + if(tmp == NULL) { + return false; + } + + fa->array = tmp; + fa->end = tmp + nnext; + fa->tail = tmp + nused; + + fa->tail->func = func; + fa->tail->data = data; + + ++fa->tail; + + return true; +} + + +bool msgpack_zone_is_empty(msgpack_zone* zone) +{ + msgpack_zone_chunk_list* const cl = &zone->chunk_list; + msgpack_zone_finalizer_array* const fa = &zone->finalizer_array; + return cl->free == zone->chunk_size && cl->head->next == NULL && + fa->tail == fa->array; +} + + +void msgpack_zone_destroy(msgpack_zone* zone) +{ + destroy_finalizer_array(&zone->finalizer_array); + destroy_chunk_list(&zone->chunk_list); +} + +void msgpack_zone_clear(msgpack_zone* zone) +{ + clear_finalizer_array(&zone->finalizer_array); + clear_chunk_list(&zone->chunk_list, zone->chunk_size); +} + +bool msgpack_zone_init(msgpack_zone* zone, size_t chunk_size) +{ + zone->chunk_size = chunk_size; + + if(!init_chunk_list(&zone->chunk_list, chunk_size)) { + return false; + } + + init_finalizer_array(&zone->finalizer_array); + + return true; +} + +msgpack_zone* msgpack_zone_new(size_t chunk_size) +{ + msgpack_zone* zone = (msgpack_zone*)malloc( + sizeof(msgpack_zone)); + if(zone == NULL) { + return NULL; + } + + zone->chunk_size = 
chunk_size; + + if(!init_chunk_list(&zone->chunk_list, chunk_size)) { + free(zone); + return NULL; + } + + init_finalizer_array(&zone->finalizer_array); + + return zone; +} + +void msgpack_zone_free(msgpack_zone* zone) +{ + if(zone == NULL) { return; } + msgpack_zone_destroy(zone); + free(zone); +} diff --git a/c/third_party/sha256/sha256.c b/c/third_party/sha256/sha256.c new file mode 100644 index 0000000..29b81ee --- /dev/null +++ b/c/third_party/sha256/sha256.c @@ -0,0 +1,158 @@ +/********************************************************************* +* Filename: sha256.c +* Author: Brad Conte (brad AT bradconte.com) +* Copyright: +* Disclaimer: This code is presented "as is" without any guarantees. +* Details: Implementation of the SHA-256 hashing algorithm. + SHA-256 is one of the three algorithms in the SHA2 + specification. The others, SHA-384 and SHA-512, are not + offered in this implementation. + Algorithm specification can be found here: + * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf + This implementation uses little endian byte order. 
+*********************************************************************/
+
+/*************************** HEADER FILES ***************************/
+#include <stdlib.h>
+#include <string.h>
+#include "sha256.h"
+
+/****************************** MACROS ******************************/
+#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
+#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))
+
+#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
+#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
+#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
+#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
+#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))
+
+/**************************** VARIABLES *****************************/
+static const WORD k[64] = {
+    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
+    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
+    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
+    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
+    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
+    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
+    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+};
+
+/*********************** FUNCTION DEFINITIONS ***********************/
+void sha256_transform(SHA256_CTX *ctx, const BYTE data[])
+{
+    WORD a, b, c, d, e, f, g, h, i, j, t1, t2, m[64];
+
+    for (i = 0, j = 0; i < 16; ++i, j += 4)
+        m[i] = ((WORD)data[j] << 24) | ((WORD)data[j + 1] << 16) | ((WORD)data[j + 2] << 8) | ((WORD)data[j + 3]);
+    for ( ; i < 64; ++i)
+        m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16];
+
+    a = ctx->state[0];
+    b = ctx->state[1];
+    c = ctx->state[2];
+    d = ctx->state[3];
+    e = ctx->state[4];
+    f = ctx->state[5];
+    g = ctx->state[6];
+    h = ctx->state[7];
+
+    for (i = 0; i < 64; ++i) {
+        t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
+        t2 = EP0(a) + MAJ(a,b,c);
+        h = g;
+        g = f;
+        f = e;
+        e = d + t1;
+        d = c;
+        c = b;
+        b = a;
+        a = t1 + t2;
+    }
+
+    ctx->state[0] += a;
+    ctx->state[1] += b;
+    ctx->state[2] += c;
+    ctx->state[3] += d;
+    ctx->state[4] += e;
+    ctx->state[5] += f;
+    ctx->state[6] += g;
+    ctx->state[7] += h;
+}
+
+void sha256_init(SHA256_CTX *ctx)
+{
+    ctx->datalen = 0;
+    ctx->bitlen = 0;
+    ctx->state[0] = 0x6a09e667;
+    ctx->state[1] = 0xbb67ae85;
+    ctx->state[2] = 0x3c6ef372;
+    ctx->state[3] = 0xa54ff53a;
+    ctx->state[4] = 0x510e527f;
+    ctx->state[5] = 0x9b05688c;
+    ctx->state[6] = 0x1f83d9ab;
+    ctx->state[7] = 0x5be0cd19;
+}
+
+void sha256_update(SHA256_CTX *ctx, const BYTE data[], size_t len)
+{
+    size_t i; // same width as len: a 32-bit index would wrap and never reach len > UINT_MAX
+
+    for (i = 0; i < len; ++i) {
+        ctx->data[ctx->datalen] = data[i];
+        ctx->datalen++;
+        if (ctx->datalen == 64) {
+            sha256_transform(ctx, ctx->data);
+            ctx->bitlen += 512;
+            ctx->datalen = 0;
+        }
+    }
+}
+
+void sha256_final(SHA256_CTX *ctx, BYTE hash[])
+{
+    WORD i;
+
+    i = ctx->datalen;
+
+    // Pad whatever data is left in the buffer.
+    if (ctx->datalen < 56) {
+        ctx->data[i++] = 0x80;
+        while (i < 56)
+            ctx->data[i++] = 0x00;
+    }
+    else {
+        ctx->data[i++] = 0x80;
+        while (i < 64)
+            ctx->data[i++] = 0x00;
+        sha256_transform(ctx, ctx->data);
+        memset(ctx->data, 0, 56);
+    }
+
+    // Append to the padding the total message's length in bits and transform.
+    ctx->bitlen += ctx->datalen * 8;
+    ctx->data[63] = ctx->bitlen;
+    ctx->data[62] = ctx->bitlen >> 8;
+    ctx->data[61] = ctx->bitlen >> 16;
+    ctx->data[60] = ctx->bitlen >> 24;
+    ctx->data[59] = ctx->bitlen >> 32;
+    ctx->data[58] = ctx->bitlen >> 40;
+    ctx->data[57] = ctx->bitlen >> 48;
+    ctx->data[56] = ctx->bitlen >> 56;
+    sha256_transform(ctx, ctx->data);
+
+    // Since this implementation uses little endian byte ordering and SHA uses big endian,
+    // reverse all the bytes when copying the final state to the output hash.
+    for (i = 0; i < 4; ++i) {
+        hash[i]      = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff;
+        hash[i + 4]  = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff;
+        hash[i + 8]  = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff;
+        hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff;
+        hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff;
+        hash[i + 20] = (ctx->state[5] >> (24 - i * 8)) & 0x000000ff;
+        hash[i + 24] = (ctx->state[6] >> (24 - i * 8)) & 0x000000ff;
+        hash[i + 28] = (ctx->state[7] >> (24 - i * 8)) & 0x000000ff;
+    }
+}
diff --git a/c/third_party/sha256/sha256.h b/c/third_party/sha256/sha256.h
new file mode 100644
index 0000000..7123a30
--- /dev/null
+++ b/c/third_party/sha256/sha256.h
@@ -0,0 +1,34 @@
+/*********************************************************************
+* Filename:   sha256.h
+* Author:     Brad Conte (brad AT bradconte.com)
+* Copyright:
+* Disclaimer: This code is presented "as is" without any guarantees.
+* Details:    Defines the API for the corresponding SHA-256 implementation.
+*********************************************************************/
+
+#ifndef SHA256_H
+#define SHA256_H
+
+/*************************** HEADER FILES ***************************/
+#include <stddef.h>
+
+/****************************** MACROS ******************************/
+#define SHA256_BLOCK_SIZE 32 // SHA256 outputs a 32 byte digest
+
+/**************************** DATA TYPES ****************************/
+typedef unsigned char BYTE; // 8-bit byte
+typedef unsigned int WORD; // 32-bit word, change to "long" for 16-bit machines
+
+typedef struct {
+    BYTE data[64];
+    WORD datalen;
+    unsigned long long bitlen;
+    WORD state[8];
+} SHA256_CTX;
+
+/*********************** FUNCTION DECLARATIONS **********************/
+void sha256_init(SHA256_CTX *ctx);
+void sha256_update(SHA256_CTX *ctx, const BYTE data[], size_t len);
+void sha256_final(SHA256_CTX *ctx, BYTE hash[]);
+
+#endif // SHA256_H
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..f996137
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,29 @@
+# DaggerML docs
+
+This `docs/` tree is the human-facing documentation for DaggerML.
+
+If you want change proposals, task lists, or planning artifacts for work on the repo, look in `openspec/` instead. `openspec/` is for change planning; `docs/` is for learning and using the project.
+
+## Start here
+
+- [Getting started](getting-started.md): install DaggerML, create a repo, make a first DAG, and inspect it.
+- [Concepts](concepts/README.md): the core ideas behind DAGs, commits, refs, execution, storage, remotes, and values.
+- [Guides](guides/README.md): task-focused walkthroughs built around real DaggerML workflows.
+- [Reference](reference/README.md): the exact Python API, CLI, configuration, and error surfaces.
+- [Architecture](architecture/README.md): how the system is put together internally for advanced readers and contributors.
+- [Contrib](contrib/README.md): contrib-specific APIs, runtime pieces, and supporting docs.
+ +## What DaggerML exposes + +- A Python API centered on `Dml`, `Dag`, `Node`, `Ref`, and helpers such as `new()` and `load()`. +- A CLI centered on `dml` commands for repo, DAG, commit, config, and related inspection workflows. +- A contrib package for adapters, executors, codecs, and helper APIs that extend the core runtime. + +## Reading path + +Start with [Getting started](getting-started.md) if you want a first working repo. After that, move by question: + +- use [Concepts](concepts/README.md) for mental models +- use [Guides](guides/README.md) for workflows +- use [Reference](reference/README.md) for exact commands and APIs +- use [Architecture](architecture/README.md) when you need the implementation picture diff --git a/docs/architecture/README.md b/docs/architecture/README.md new file mode 100644 index 0000000..8877263 --- /dev/null +++ b/docs/architecture/README.md @@ -0,0 +1,12 @@ +# Architecture + +This section is for readers who already know what DaggerML does and want to see how the pieces are put together. + +- [System overview](system-overview.md): the main layers, data flow, and where public APIs hand work to the internals. +- [Internal modules](internal-modules.md): a map of the main packages and files in `src/daggerml/`. +- [Ops layer](ops-layer.md): the transactional subsystems that implement repo behavior. +- [Storage internals](storage-internals.md): how refs, namespaces, LMDB storage, and on-disk pointers fit together. +- [Remote protocol](remote-protocol.md): how S3-backed CAS, refs, manifests, and execution metadata work. +- [Type system](type-system.md): the dataclasses, namespace registry, and validation rules that shape persisted state. + +These pages describe the current implementation. They stay close to the real module layout so contributors can move between the docs and the code without translation. 
diff --git a/docs/architecture/internal-modules.md b/docs/architecture/internal-modules.md new file mode 100644 index 0000000..a1d7ad3 --- /dev/null +++ b/docs/architecture/internal-modules.md @@ -0,0 +1,61 @@ +# Internal Modules + +Most of DaggerML's implementation lives under `src/daggerml/`, but the important architectural boundary is not "public package vs everything else." It is "ergonomic wrappers vs transactional repository core." + +## Top-level package map + +- `api.py`: the main Python-facing interface. It exposes the friendly `Dag` wrapper, default-runtime helpers, and node objects that feel natural in Python code. +- `codecs.py`: value staging and loading for Python objects that do not map directly onto the internal dataclasses. +- `_cli.py`: the CLI surface. It mostly translates command-line intent into `Dml` method calls. +- `_internal/`: the repository engine. +- `contrib/`: adapters, executors, codecs, and helper surfaces that extend the core runtime. + +## The `_internal` package + +`_internal` is where DaggerML stops looking like a user library and starts looking like a small versioned object database with execution support. + +### Runtime and context + +- `dml.py`: defines `Dml`, the central orchestration object. It exposes namespaces such as `runtime`, `dag`, `config`, and `admin`, opens the DB, and composes ops objects for each request. +- `dml_context.py`: resolves runtime context from config and environment, including project paths, remote roots, and user identity. +- `config.py`: config models and validation. +- `dml_resolution.py`: revision, DAG, and node selector resolution used by the user-facing commands. +- `revision_uri.py`: the parser and validator for `dml://owner/project#branch` and `@tag` selectors. + +### Domain model + +- `types.py`: the typed object model for everything persisted locally: datums, runnable specs, errors, nodes, DAGs, trees, and commits. 
+- `builtins.py`: built-in function implementations used before the runtime falls back to adapter execution. + +### Repository subsystems + +- `ops/base_ops.py`: shared transaction, object IO, and retry machinery. +- `ops/head.py`: HEAD state, branches, indexes, and tracking refs on disk. +- `ops/commit.py`: commit creation and history operations. +- `ops/index.py`: mutable workspaces, DAG building, function execution, cancellation, and cache publication. +- `ops/dag.py`: read access to finished DAGs. +- `ops/node.py`: value retrieval and datum unrolling. +- `ops/cache.py`: the cache-facing bridge between local argv refs and remote cache refs. +- `ops/remote.py`: remote CAS, refs, manifests, project sync, execution invalidation, and remote GC. +- `ops/gc.py`: local orphan discovery and deletion. +- `ops/config.py`: config file editing. This one sits near the ops layer conceptually, but it works on TOML files rather than LMDB transactions. + +### Persistence and execution helpers + +- `_db.pyx`: the Cython wrapper around the C LMDB layer. It handles typed value conversion, transactions, object iteration, and orphan listing. +- `exec_state.py`: S3-backed execution lock, launch state, execution state, lineage edges, and adapter IO helpers. +- `execution_context.py`: contextvars for the current execution id and cache key. +- `util.py`: timestamps, IDs, and smaller shared helpers. + +## Why the split looks this way + +The public API is intentionally thin. It does not reimplement repository behavior; it mostly stages values and delegates. That keeps the rules for storage, history, caching, and sync in one place: the internal runtime and ops layer. 
+ +The `_internal` package is also split so that contributors can reason locally: + +- if a change is about persisted shape, look at `types.py` and `_db.pyx`, +- if it is about repo behavior, look at `ops/`, +- if it is about how callers discover or target state, look at `dml.py`, `dml_context.py`, and the resolution helpers, +- if it is about remote execution coordination, look at `ops/remote.py` and `exec_state.py` together. + +The next page, [ops layer](ops-layer.md), covers how those subsystems interact in practice. diff --git a/docs/architecture/ops-layer.md b/docs/architecture/ops-layer.md new file mode 100644 index 0000000..c99d0cc --- /dev/null +++ b/docs/architecture/ops-layer.md @@ -0,0 +1,76 @@ +# Ops Layer + +The ops layer is the repository's working core. Each class in `src/daggerml/_internal/ops/` owns a narrow slice of behavior, but they all share the same transaction and object model. + +## Shared foundation + +`BaseOps` in `base_ops.py` does three jobs for the whole layer: + +- opens read or write transactions through `_tx()`, +- serializes and deserializes typed objects through `TxnContext.put()` and `TxnContext.get()`, +- retries recoverable DB failures such as map growth or environment reopen events. + +That means the higher-level ops classes can focus on branch rules, commit structure, or execution flow instead of raw LMDB details. + +## Pointer and history subsystems + +`HeadOps` manages the lightweight filesystem pointers under `.dml/`. It owns: + +- the current `HEAD` state, +- local branches under `.dml/refs/local/heads/`, +- mutable index pointers under `.dml/refs/local/indexes/`, +- remote-tracking refs under `.dml/refs/remote/`. + +`CommitOps` works one layer below that. It reads and writes `Commit` and `Tree` objects, computes DAG-map diffs, and implements merge, rebase, and revert behavior. In practice, `HeadOps` answers "what commit should I start from?" and `CommitOps` answers "what new commit should exist after this operation?" 
+ +## Read-only graph access + +`DagOps` and `NodeOps` are the read side of the repository model. + +- `DagOps` lists DAGs, describes finished DAGs, and resolves named nodes plus argv and kwargv nodes. +- `NodeOps` reads a node's value and can fully unroll datum refs into plain Python values. + +These modules are intentionally smaller than `IndexOps` because they do not create new graph state. They mostly interpret committed objects. + +## Mutable workspace and execution + +`IndexOps` is the busiest subsystem. + +An index is DaggerML's mutable workspace: it points at a commit, carries an in-progress DAG, and lets the runtime add literals, imports, function calls, names, results, and errors before turning that state back into a commit. + +`IndexOps` also owns function execution. Its flow is roughly: + +1. Prepare argv and kwargv nodes in the mutable DAG. +2. Try a built-in function path from `builtins.py`. +3. Check the remote-backed cache through `CacheOps`. +4. If needed, publish an argv manifest through `RemoteOps`. +5. Coordinate execution through `ExecutionState`. +6. Finish by importing the resulting DAG back into the caller's index. + +This is why `IndexOps` depends on many neighbors: `HeadOps` for index pointers, `DagOps` and `NodeOps` for graph inspection, `CacheOps` for cache lookup, and `RemoteOps` for manifest publication and remote state. + +## Cache, remote sync, and GC + +`CacheOps` is a small but important bridge. Locally, it understands that a cache key comes from the argv datum id. Remotely, it relies on `RemoteOps` to read and publish cache refs. It keeps cache identity logic close to the repository model instead of scattering it through adapters. + +`RemoteOps` is much broader. It handles: + +- manifest publication and materialization, +- cache refs and DAG refs, +- branch and tag sync for `dml://` project URIs, +- execution invalidation and cancellation metadata stored remotely, +- remote prune and mark-and-sweep GC. 
+ +`GcOps` is the local cleanup partner. It asks the DB layer for objects not reachable from branch, index, or HEAD roots and deletes them in a write transaction. + +## Relationship summary + +- `HeadOps` points at commits and indexes. +- `CommitOps` changes repository history. +- `IndexOps` creates and mutates in-progress DAG state. +- `DagOps` and `NodeOps` read finished graph state. +- `CacheOps` maps argv identity to remote cache refs. +- `RemoteOps` moves object graphs across the network boundary. +- `GcOps` removes unreachable local objects. + +The common theme is that all of them manipulate the same typed objects and refs. The layer is split by responsibility, not by separate storage backends or separate schemas. diff --git a/docs/architecture/remote-protocol.md b/docs/architecture/remote-protocol.md new file mode 100644 index 0000000..c47447b --- /dev/null +++ b/docs/architecture/remote-protocol.md @@ -0,0 +1,90 @@ +# Remote Protocol + +DaggerML's remote layer is implemented in `src/daggerml/_internal/ops/remote.py` and `src/daggerml/_internal/exec_state.py`. It uses S3 as a content-addressed store plus a small ref namespace. + +## The remote surface + +When `RemoteOps` initializes, it expects or creates a `dml.json` descriptor describing the `cas+refs` layout. Under that prefix, the important areas are: + +- `cas/sha256/<aa>/<bb>/<digest>`: immutable content-addressed objects, +- `refs/tags/**`, `refs/cache/**`, and `refs/projects/**`: published refs, +- `refs/dags/<dag-id>.json`: per-DAG manifests used to recurse across DAG boundaries, +- `exec/**` and `io/**`: execution-state and adapter-transport data. + +The remote side is not a second copy of the local repo layout. It is a transport-oriented layout designed around manifests, refs, and immutable blobs. + +The descriptor currently declares a schema version, the `sha256` hash family, and the canonical `refs`, `io`, and `cas/sha256` prefixes. 
That gives clients one stable description of how to interpret a remote root before they start reading any manifests or refs. + +## Manifest and ref shape + +The remote protocol revolves around two small JSON object families: + +- manifests, which name a root object plus the closure needed to materialize it +- refs, which publish a manifest under a discoverable path such as a branch, tag, cache key, or DAG id + +In practice, refs also carry just enough metadata for the runtime to preserve direct DAG relationships and execution provenance. Cache refs record the execution id that published them, branch refs distinguish mutable heads from immutable tags, and per-DAG refs let nested DAG boundaries stay visible instead of collapsing into one giant closure. + +## Push in plain terms + +Pushing means taking a local root object, describing the closure needed to reconstruct it, uploading any missing CAS blobs, then publishing a ref that points at the manifest. + +For branch and tag sync, the root is a commit. For cache publication, the root is usually a DAG. In both cases `RemoteOps`: + +1. walks the local object graph into a local manifest, +2. derives the direct child DAG ids for that manifest layer, +3. uploads missing CAS blobs after verifying SHA-256, +4. writes per-DAG refs for child DAG manifests as needed, +5. publishes the top-level ref with `targets` metadata for the direct DAG ids. + +That direct-DAG metadata is how the remote side keeps nested DAG relationships visible without flattening the entire graph into one manifest. + +## Pull in plain terms + +Pulling starts from a remote ref, not from an object id guessed by the client. + +`RemoteOps` reads the ref JSON, validates it, loads the target manifest, and then materializes the closure into the local database. If the manifest mentions child DAG ids, it follows `refs/dags/*.json` to load those manifests too. 
The implementation uses a thread pool so independent manifests and CAS objects can be fetched concurrently, but the resulting objects are still materialized into one local transaction path. + +At the end of a branch or tag fetch, DaggerML writes a local tracking pointer rather than treating the remote state as separately mounted storage. + +## Project sync vs cache sync + +The same remote machinery supports two different user-facing stories. + +### Project sync + +Project sync uses canonical `dml://owner/project#branch` and `@tag` URIs. `RemoteOps` validates the URI pieces with `revision_uri.py`, maps them onto `refs/projects/...`, and then enforces branch or tag semantics: + +- branch refs can be updated conditionally, +- non-fast-forward branch pushes are rejected unless `force` is set, +- tag refs are immutable once created. + +### Cache sync + +Cache sync uses `refs/cache/<cache-key>.json`. The cache key comes from the argv datum id, which means the remote cache is keyed by the normalized execution input graph rather than by a separate ad hoc hash layer in Python. + +## Execution metadata + +`ExecutionState` adds a second remote protocol surface next to CAS and refs. It stores: + +- advisory locks, +- the active execution id for a cache key, +- launch state for resumable work, +- execution lifecycle records, +- caller/callee lineage edges, +- invalidation and cancellation tombstones, +- adapter IO objects under `io/...`. + +This is what makes async and detached execution workable across processes. The local repo still owns typed DAG state, but the remote side owns the coordination data needed to produce or invalidate that DAG state safely. + +At the adapter boundary, the runtime publishes and reloads JSON transport blobs rather than passing Python objects directly. Those payloads carry the runnable, argv pointer, cache key, execution id, remote config, execution lifecycle status, and resume state needed for later polls. 
Adapters answer with a similarly small status payload so the runtime can decide whether to keep waiting, import a successful DAG, or materialize a failure. + +## Remote cleanup + +Remote cleanup is split in two on purpose: + +- `prune()` removes old transport blobs under `io/invoke/**`, which are ephemeral. +- `gc()` performs mark-and-sweep over CAS objects using published refs as roots. + +That distinction matters because transport blobs are not part of the durable repository graph, while manifests and CAS objects are. + +For the public-facing model of remotes, also see `docs/concepts/remotes.md` once the concepts lane is rebuilt. diff --git a/docs/architecture/storage-internals.md b/docs/architecture/storage-internals.md new file mode 100644 index 0000000..951d0ee --- /dev/null +++ b/docs/architecture/storage-internals.md @@ -0,0 +1,83 @@ +# Storage Internals + +DaggerML stores its real object graph in LMDB, but the repo is not LMDB alone. The `.dml/` directory mixes database state with a small filesystem-level pointer layer. + +## Two kinds of state + +### 1. Typed objects in LMDB + +The database lives at `.dml/db`. `src/daggerml/_internal/_db.pyx` wraps the lower-level C implementation and exposes: + +- transactions, +- namespace-aware put/get/delete/exists operations, +- iteration by namespace, +- orphan listing for GC, +- raw dump and load support for manifest-based transfer. + +Objects are stored by namespace and object id. The Python side treats every stored object as a `Ref("namespace:id")`, then decodes the payload through the namespace registry in `types.py`. + +### 2. Filesystem pointers in `.dml/` + +Some state is easier to manage as small files than as database objects: + +- `.dml/HEAD`: attached or detached head state, +- `.dml/refs/local/heads/*`: branch pointers, +- `.dml/refs/local/indexes/*`: mutable workspace pointers, +- `.dml/refs/remote/*`: tracking refs for fetched remote branches and tags, +- `.dml/config.toml`: project configuration. 
+ +That split gives DaggerML a Git-like control plane without forcing branch and checkout logic into the object store itself. + +## Namespace-based identity + +The object model depends on namespace prefixes being meaningful, not decorative. A ref like `dag:...` is different from `node-fn:...` or `datum-dict:...`, and code validates those expectations constantly. + +This shows up in several places: + +- `TxnContext.get()` uses `ref.ns()` to choose the Python class for deserialization. +- `require_ref()` in `types.py` checks namespace hierarchies such as `node` or `datum`. +- `TxnContext.put()` validates objects before writing and rejects unknown namespaces. + +## Object graph shape + +Committed state is built from a small set of object families: + +- `Commit` points at a `Tree` and optional focal DAG. +- `Tree` maps names to DAG refs. +- `Dag` holds node refs, names, an optional result, an optional error, and optional argv state. +- `Node` objects point to datums directly or indirectly. +- `Datum` objects hold scalar values, collections of datum refs, URIs, or runnable specs. + +Because all references are explicit, reachability is also explicit. Local GC starts from pointer roots discovered by `HeadOps` and asks the DB layer which objects are no longer reachable. + +## Transactions and retries + +The DB layer is transactional, and the Python code leans on that heavily. Write operations are expected to either produce a whole coherent object graph or fail cleanly. + +`BaseOps.with_retry` handles the two recoverable cases the DB layer can report during normal operation: + +- the LMDB map needs to grow, +- the environment was reopened and the caller should retry the transaction. + +This keeps the storage layer robust without forcing each subsystem to reimplement retry logic. + +## Local manifests + +Remote publication and cross-process execution use a `local-manifest` shape produced from local storage. 
In practice this is a transport bundle containing: + +- the root namespace and id, +- a closure of raw object dumps grouped by namespace, +- direct child DAG ids when the graph crosses a DAG boundary. + +The important detail is that local manifests do not inline every child DAG recursively into one huge blob. They stop at child DAG refs and let the remote layer publish or resolve those DAGs separately. That keeps DAG boundaries visible in transport as well as in local storage. + +## Why this design matters + +The storage model is simple enough to reason about directly: + +- the database stores immutable typed objects, +- refs and namespaces define identity, +- filesystem pointers choose which commits and indexes are live, +- manifests move closures across process and remote boundaries. + +That simplicity is what lets the higher layers compose history, execution, caching, and sync without inventing separate persistence rules for each feature. diff --git a/docs/architecture/system-overview.md b/docs/architecture/system-overview.md new file mode 100644 index 0000000..cf7667e --- /dev/null +++ b/docs/architecture/system-overview.md @@ -0,0 +1,40 @@ +# System Overview + +DaggerML has a small public surface and a fairly dense internal core. + +At the top, `daggerml.api` gives Python users a convenient interface built around `Dml`, `Dag`, and `Node`. The CLI layers on top of the same runtime ideas through `src/daggerml/_cli.py`. Under that, `daggerml._internal.dml.Dml` is the main orchestration boundary: it resolves config, opens the database, and routes requests to narrower subsystems. + +The internals are organized in four broad layers: + +1. Public entrypoints: `daggerml.api`, `_cli.py`, and the codec helpers in `daggerml.codecs`. +2. Runtime orchestration: `src/daggerml/_internal/dml.py`, plus config and revision-resolution helpers. +3. Repository subsystems: `src/daggerml/_internal/ops/*.py`. +4. 
Persistence and typed state: `src/daggerml/_internal/types.py` and `src/daggerml/_internal/_db.pyx`. + +## How a local write flows + +Creating or updating a DAG usually starts in `daggerml.api`. The high-level `Dag` wrapper stages Python values through `daggerml.codecs`, then calls runtime methods on `Dml.runtime`. Those runtime methods open a database handle and delegate to `IndexOps`, `DagOps`, or `NodeOps` depending on the job. + +Inside the ops layer, work happens inside explicit LMDB transactions. `BaseOps` provides the shared transaction wrapper, object reads and writes, and manifest load helpers. The concrete ops classes apply domain rules on top of that shared machinery: branch pointers in `HeadOps`, history in `CommitOps`, mutable workspaces and execution in `IndexOps`, and so on. + +The storage layer is deliberately simple: objects live in namespace-partitioned LMDB tables and are addressed by `Ref("namespace:id")`. The filesystem around `.dml/` stores lightweight pointers such as `HEAD` and branch or index refs, while the object graph itself stays in LMDB. + +## How execution flows + +Function execution is centered in `IndexOps.start_fn()`. It prepares an argv node, tries built-in execution first, then checks the remote-backed cache through `CacheOps`. If there is no cached result, it publishes an argument manifest through `RemoteOps`, coordinates a run with `ExecutionState`, and waits for either a finished cached DAG or an in-progress execution to resume later. + +That split is important to the architecture: + +- local LMDB storage keeps the typed repository state, +- remote storage carries cache refs, manifests, and transport state, +- `ExecutionState` in `exec_state.py` handles the advisory lock and execution lineage needed for async adapters. + +## How history flows + +Committed repository state is modeled as `Commit -> Tree -> Dag -> Node -> Datum/Error`. `HeadOps` points the working repo at a commit through `HEAD`, branch refs, and index refs. 
`CommitOps` creates new commits, walks history, and handles merge, rebase, and revert logic. `DagOps` and `NodeOps` then give read access to the objects inside those commits. + +## How remote sync fits in + +Remote sync is not a separate storage engine. It is a transport and publication layer around the local object graph. `RemoteOps` takes local objects, builds manifests, uploads missing CAS objects to S3, and publishes refs under the remote `refs/` tree. Pulling does the reverse: resolve a remote ref, fetch the needed manifests and CAS objects, materialize them locally, then update the appropriate tracking pointer. + +For more detail, continue with [internal modules](internal-modules.md) and [ops layer](ops-layer.md). diff --git a/docs/architecture/type-system.md b/docs/architecture/type-system.md new file mode 100644 index 0000000..c154194 --- /dev/null +++ b/docs/architecture/type-system.md @@ -0,0 +1,82 @@ +# Type System + +The type system in `src/daggerml/_internal/types.py` is the contract that keeps the repository coherent. It is less about static typing in Python and more about making sure persisted objects have a predictable shape and reference each other correctly. + +## Namespace registry first + +Everything starts with `NAMESPACES`, a runtime registry that maps namespace strings to Python classes. + +Two families register themselves automatically: + +- `Datum` subclasses become `datum-*` namespaces, +- `Node` subclasses become `node-*` namespaces. + +Other persisted objects such as `Dag`, `Tree`, `Commit`, `Error`, and `Deletable` register through `_register_dml_obj`. + +This is what lets the storage layer turn `Ref("commit:...")` or `Ref("datum-list:...")` back into the right dataclass on read. + +## Core object families + +### Datums + +Datums are the stored value layer. + +- `ScalarDatum` stores Python scalars. +- `ListDatum` and `DictDatum` store refs to other datums, not embedded Python objects. +- `Uri` stores external locations. 
+- `RunnableDatum` stores executable specifications in a repository-friendly form. + +One subtle but important split is `Runnable` vs `RunnableDatum`. + +- `Runnable` is the public, fully materialized Python object. +- `RunnableDatum` is the internal persisted form, where `target`, `sub`, and `kwargs` are refs to other stored objects. + +That split keeps the repository graph explicit even when the Python API exposes something friendlier. + +### Nodes + +Nodes are the computation layer. + +- `LiteralNode` points directly at a datum or error-like value. +- `ArgvNode` and `KwargvNode` are specialized literals used to anchor function inputs. +- `ImportNode` imports a value from another DAG. +- `FnNode` points at a child DAG created by a function call. + +Each node knows how to turn itself into a datum ref through `datum_ref(txn)`, which is the common interface used by `NodeOps` and the execution path. + +### Graph and history objects + +- `Dag` gathers nodes, names, and either a result or an error. +- `Tree` is a named map of DAG refs. +- `Commit` is a versioned snapshot pointing at a tree and optional focal DAG. + +Together they form the repository's history model: commits snapshot trees, trees name DAGs, DAGs connect nodes, and nodes reach stored values. + +### Errors as data + +`Error` and `DmlRepoError` are not only in-memory exceptions. They are part of the persisted model too. A failed computation can be captured as structured error state with message, origin, type, and stack frames, then referenced from a DAG. + +That makes failures inspectable through the same object graph as successful results. + +## Validation strategy + +Every persisted object validates itself in `__post_init__()` by calling `_validate()`. + +In practice, validation checks three things: + +- field shapes are correct, +- refs point to the expected namespace family, +- impossible combinations, such as a DAG with both `result` and `error`, are rejected early. 
+ +The helper `require_ref()` is used throughout the model to check both "is this a ref?" and "does its namespace hierarchy match what this field expects?" + +## Why contributors feel this layer everywhere + +The type system is not a side file. It affects almost every subsystem: + +- `BaseOps` depends on it for serialization and deserialization, +- the ops layer depends on it for field and namespace guarantees, +- remote manifests depend on stable namespace and object-id behavior, +- the public API depends on it when staging Python values into stored datums. + +If you are changing object shape, namespace rules, or how values cross the public/internal boundary, this is usually the first file to inspect. diff --git a/docs/concepts/README.md b/docs/concepts/README.md new file mode 100644 index 0000000..be340fb --- /dev/null +++ b/docs/concepts/README.md @@ -0,0 +1,15 @@ +# Concepts + +This section explains the core ideas behind DaggerML without turning into command or API reference. + +Start with [Overview](overview.md) for the big picture, then use the topic pages for the parts you need: + +- [DAGs and nodes](dags-and-nodes.md) +- [Commits and history](commits-and-history.md) +- [Refs and namespaces](refs-and-namespaces.md) +- [Execution](execution.md) +- [Storage](storage.md) +- [Remotes](remotes.md) +- [Codecs and values](codecs-and-values.md) + +If you want exact commands or method signatures, use the guides and reference sections instead. diff --git a/docs/concepts/codecs-and-values.md b/docs/concepts/codecs-and-values.md new file mode 100644 index 0000000..145cb0e --- /dev/null +++ b/docs/concepts/codecs-and-values.md @@ -0,0 +1,52 @@ +# Codecs and values + +Codecs are the bridge between ordinary Python values and the value model DaggerML stores in DAGs. 
+ +## The stored value model + +At the storage layer, values are represented as datum objects such as: + +- scalar datums for `str`, `int`, `float`, `bool`, and `None` +- list and dict datums that point to other datum refs +- `Uri` datums for external locations +- `RunnableDatum` for executable values + +Nodes then point at those datums. A `LiteralNode` is the most direct example: it wraps a datum ref. + +## What codecs do + +Before a value is staged into a DAG, `daggerml.codecs` runs codec normalization. + +The codec flow: + +- finds the first matching codec by priority and registration order +- lets that codec encode the value +- re-applies codec matching if the encoded result changed +- recursively normalizes lists, dicts, and runnable fields + +This happens on public DAG staging and call-entry paths, not deep inside storage. + +## Built-in codec behavior + +The built-in codecs cover two especially important cases: + +- `NodeCodec` lets a node from the same DAG or another committed DAG be staged as a value +- `DelayedActionCodec` resolves delayed references, delayed loads, and delayed runnable construction into concrete staged values + +That is how cross-DAG imports and adapter-driven runnable resolution can feel natural in Python while still landing in the explicit stored model. + +## Plugin codecs + +Additional codecs can be discovered through the `daggerml.codecs` entry-point group. The registry loads them lazily and applies them deterministically. + +The practical mental model is simple: if a Python object is not already a plain stored value, a codec is the place where DaggerML learns how to turn it into one. + +## How to think about it + +Values enter DaggerML in Python shapes, but DAGs store a normalized graph of datum refs and node refs. Codecs are the translation layer that keeps those two worlds aligned. 
+ +See also: + +- [DAGs and nodes](dags-and-nodes.md) +- [Execution](execution.md) +- [Storage](storage.md) diff --git a/docs/concepts/commits-and-history.md b/docs/concepts/commits-and-history.md new file mode 100644 index 0000000..6d037e9 --- /dev/null +++ b/docs/concepts/commits-and-history.md @@ -0,0 +1,55 @@ +# Commits and history + +DaggerML versions repositories in a git-like way, but the things being versioned are named DAG snapshots. + +## The main objects + +The history model centers on four pieces: + +- `Commit`: an immutable snapshot with parents, metadata, and a tree ref +- `Tree`: a mapping from DAG names to DAG refs +- branch refs: pointers to commits +- index refs: mutable working pointers rooted from a commit + +The tree is the bridge between history and DAGs. A commit does not store every DAG inline; it points at a tree, and the tree names the DAGs visible at that revision. + +## Working state versus recorded history + +When you start new work, DaggerML creates an index from a base commit. That index is where DAG-building operations happen. When you commit, DaggerML writes a new immutable commit and updates either: + +- the current branch pointer, or +- a detached output path for function-driven work when no branch update is requested + +This separation explains why repository operations feel familiar to source control while still supporting execution-oriented workflows. + +## Branches and HEAD + +`HeadOps` manages a HEAD file plus branch and index pointers under `.dml/refs/`. + +- An attached HEAD follows a branch. +- A detached HEAD points directly at a commit. +- Branch pointers move forward explicitly. +- Index pointers track mutable staging state separately from branches. + +In practice, branches answer "which commit is current?" while indexes answer "where is the work in progress?" + +## History shape + +Commits store parent refs, so ancestry defines history. Merge commits can have two parents. 
Rebase and merge operate by comparing and rewriting tree state, especially the mapping from DAG names to DAG refs. + +The interesting unit of change is usually not a single node. It is which named DAGs a commit's tree adds, removes, or replaces. + +## How to think about it + +If DAGs are the computation artifacts, commits are the repository timeline that organizes them. The usual reading flow is: + +1. resolve a branch or revision to a commit +2. read the commit's tree +3. look up the named DAG you care about +4. inspect the DAG and its nodes + +See also: + +- [DAGs and nodes](dags-and-nodes.md) +- [Refs and namespaces](refs-and-namespaces.md) +- [Remotes](remotes.md) diff --git a/docs/concepts/dags-and-nodes.md b/docs/concepts/dags-and-nodes.md new file mode 100644 index 0000000..02b5505 --- /dev/null +++ b/docs/concepts/dags-and-nodes.md @@ -0,0 +1,68 @@ +# DAGs and nodes + +In DaggerML, a DAG is the stored record of a computation, not just a planning structure. + +## What a DAG contains + +The core `Dag` type stores: + +- `nodes`: the node refs that make up the graph +- `names`: a mapping from user-facing names to node refs +- `result` or `error`: the terminal outcome +- optional `argv`: the node that captures call inputs for function DAGs + +That means a DAG can describe both simple literal work and a function execution with recorded inputs and outputs. + +## DAGs are immutable snapshots + +The public API lets you work with a mutable handle while building a DAG, but the persisted DAG objects are immutable snapshots. Internally, index operations advance by creating new DAG states rather than editing one in place. + +That is why the same repository has two layers of state: + +- an index for in-progress work +- DAG refs for finished snapshots + +Committing saves the current snapshot into history. Loading a committed DAG gives you a stable object you can inspect or import from, but not mutate. 
+ +## Node kinds + +The stored node types map to a small set of roles: + +- `LiteralNode`: a literal value or error payload +- `ImportNode`: a node imported from another DAG +- `FnNode`: the result of calling a runnable with specific arguments +- `ArgvNode`: the positional call inputs for a function DAG +- `KwargvNode`: the keyword call inputs for a function DAG + +Most user code does not construct those classes directly. Instead, they appear as the result of assigning Python values, importing nodes across DAGs, or calling runnables. + +## Names are labels, not ownership + +`Dag.names` is a mapping from strings to node refs. A name points at a node; it does not create a second copy of the node. Multiple names can point at the same underlying node, and unnamed nodes can still exist as part of the graph. + +This matters when reading a DAG: + +- `nodes` is the full graph inventory +- `names` is the convenient lookup table +- `result` is the node treated as the DAG's final answer + +## Function calls produce nested DAG structure + +When you call a runnable, DaggerML creates a function DAG for that execution and a `FnNode` in the caller. The `FnNode` points at the called DAG, and the called DAG records its own `argv` and terminal result or error. + +That gives DaggerML a durable record of how one computation led to another. Cross-DAG links stay explicit instead of becoming hidden object pointers. 
+ +## How to think about it + +It helps to think of a DAG as a replayable, inspectable computation artifact: + +- values become literal nodes +- function calls become function-result nodes +- imported results stay marked as imports +- the final answer is a named or unnamed node selected as `result` + +See also: + +- [Codecs and values](codecs-and-values.md) +- [Execution](execution.md) +- [Commits and history](commits-and-history.md) diff --git a/docs/concepts/execution.md b/docs/concepts/execution.md new file mode 100644 index 0000000..d43db50 --- /dev/null +++ b/docs/concepts/execution.md @@ -0,0 +1,66 @@ +# Execution + +Execution in DaggerML means turning a runnable plus its arguments into another DAG. + +## The input to execution + +The public `Runnable` value holds: + +- a `Uri` target +- an optional nested `sub` runnable +- keyword defaults +- an adapter name + +Before execution starts, DaggerML normalizes the call inputs, stages them into the working DAG, and builds: + +- an `ArgvNode` for positional inputs +- optionally a `KwargvNode` for keyword inputs + +Those nodes are not extra metadata bolted on later. They are part of the persisted computation record and also drive cache identity. + +## Builtins, cache, and adapters + +`IndexOps.start_fn(...)` follows three broad paths: + +1. Try a builtin implementation for supported `daggerml:` URIs. +2. If the call is not builtin, check for a cached DAG result keyed by the staged argv identity. +3. On a cache miss, coordinate adapter execution through remote-backed execution state. + +That means function execution is not just "run a process." It is a repository operation that first asks whether this exact computation already has a known DAG result. + +When execution crosses the adapter boundary, DaggerML sends a structured payload that includes the staged argv identity, the cache key derived from it, the runnable spec, remote settings, and any saved resume state. 
Adapter responses then fall into a small set of states: still running, succeeded with a DAG id, failed with an error payload, or detached from cancellation. + +The important design point is that adapters do not get to redefine execution identity. The argv-backed cache key and the stored execution records remain the source of truth. + +## Remote-backed execution state + +For non-builtin execution, DaggerML uses remote state to coordinate work: + +- a cache ref for completed results +- an active-execution pointer for in-flight work on the same cache key +- launch and lifecycle records for resume and status tracking + +If another caller is already driving the same computation, a later caller can detect that and resume or wait rather than launching duplicate work. + +## What success and failure mean + +On success, DaggerML expects a DAG result to appear in cache and then links it back into the caller as a `FnNode`. + +On failure, DaggerML materializes a failed DAG, publishes that terminal state, and raises the recorded DAG error back through the caller. + +So even failures become part of the graph model instead of disappearing into logs. + +## How to think about it + +Execution is best understood as DAG production with caching and coordination around it: + +- the call inputs become part of the DAG model +- the result is another DAG, not just an in-memory return value +- cache identity follows the staged call shape +- remote state lets multiple callers coordinate around the same work + +See also: + +- [DAGs and nodes](dags-and-nodes.md) +- [Remotes](remotes.md) +- [Codecs and values](codecs-and-values.md) diff --git a/docs/concepts/overview.md b/docs/concepts/overview.md new file mode 100644 index 0000000..71b0627 --- /dev/null +++ b/docs/concepts/overview.md @@ -0,0 +1,29 @@ +# Overview + +DaggerML combines a few ideas that fit together closely: + +- A DAG is an immutable record of one computation. 
+- Nodes are the individual values, imports, and function calls inside that DAG. +- Commits version named DAGs the same way a source-control system versions files. +- Refs are the typed identities that connect every persisted object. +- Execution turns a runnable call into another DAG, often through adapters and remote cache state. +- Storage keeps local repository objects in a content-addressed object store and leaves large external payloads behind URIs. +- Remotes publish commits, DAGs, and cache results into an S3-backed CAS-plus-refs layout. +- Codecs normalize Python values into the stored value model before they enter a DAG. + +One useful way to read the system is from outside in: + +1. You create or load a repository runtime with `Dml`. +2. You open a working DAG through `new()` or `Dml.new(...)`. +3. As you assign values or call functions, DaggerML stages nodes into an index. +4. Finishing the work produces a DAG snapshot. +5. Committing records that snapshot in history and attaches it to a branch or detached commit. +6. Optional remote sync publishes the resulting state to shared storage. + +That split is important: the working index is mutable, but the DAGs and commits it produces are not. + +Read next: + +- [DAGs and nodes](dags-and-nodes.md) for the graph model. +- [Commits and history](commits-and-history.md) for repository versioning. +- [Execution](execution.md) for what happens during function calls. diff --git a/docs/concepts/refs-and-namespaces.md b/docs/concepts/refs-and-namespaces.md new file mode 100644 index 0000000..38e1006 --- /dev/null +++ b/docs/concepts/refs-and-namespaces.md @@ -0,0 +1,56 @@ +# Refs and namespaces + +Refs are the connective tissue of DaggerML. Nearly every persisted relationship is expressed as a typed ref rather than an embedded object. + +## Ref shape + +A ref is a string-like identity in the form `namespace:id`. + +The namespace tells you what kind of object the ref is expected to resolve to. 
The id is the stable identity inside that namespace. + +Examples from the current model include: + +- `dag`, `commit`, `tree`, `head`, `index` +- `node-literal`, `node-fn`, `node-import`, `node-argv`, `node-kwargv` +- `datum-scalar`, `datum-list`, `datum-dict`, `datum-uri`, `datum-runnable` +- `error`, `deletable` + +## Why namespaces matter + +Namespaces are not just prefixes for display. They are part of validation and runtime safety. + +The internal type layer checks namespace expectations whenever an object says it should point at a DAG, node, datum, commit, or other stored type. Using the right namespace is how DaggerML keeps object graphs explicit and well-typed even though everything is connected through refs. + +## Refs are how objects stay shareable + +Because DAGs, commits, nodes, and data all point at each other through refs: + +- objects can be content-addressed and deduplicated +- multiple higher-level objects can share the same lower-level object +- remote sync can transfer object graphs without depending on Python object identity + +That is especially visible in two places: + +- DAGs refer to nodes and result/error refs +- commits refer to trees, and trees refer to DAGs + +## Names are different from refs + +User-facing names such as DAG names, branch names, or `Dag.names` entries are lookup handles. They are mutable labels that eventually resolve to refs. + +Refs are the durable identities underneath those labels. + +That distinction helps when reading the system: + +- names help humans navigate +- refs are what the storage model actually links together + +## How to think about it + +If you are ever unsure what an object relationship means, look for the ref and its namespace. In DaggerML, that usually tells you both the target object family and the layer boundary being crossed. 
+ +See also: + +- [Storage](storage.md) +- [Commits and history](commits-and-history.md) +- [Remotes](remotes.md) diff --git a/docs/concepts/remotes.md b/docs/concepts/remotes.md new file mode 100644 index 0000000..c435a7d --- /dev/null +++ b/docs/concepts/remotes.md @@ -0,0 +1,57 @@ +# Remotes + +Remotes let DaggerML share repository state and execution results outside the local LMDB store. + +## The remote model + +The current remote implementation uses S3 storage with two complementary layers: + +- CAS for immutable objects, addressed by SHA-256 object id +- refs for discoverable names and mutable pointers + +This split mirrors the local design: + +- immutable content is stored by identity +- human or workflow-oriented names resolve through refs + +## What gets published + +Remote state includes a few different families of refs: + +- project branch and tag refs for git-like sync +- DAG refs for per-DAG publication and discovery +- cache refs for function-result memoization + +It also includes transport blobs under `io/invoke/` for adapter and executor boundaries. + +At the top of the remote prefix, DaggerML stores a small `dml.json` descriptor describing the `cas+refs` layout. Under that layout, immutable payloads live in `cas/sha256/...`, discoverable names live under `refs/...`, and adapter transport or execution-coordination payloads live outside CAS in `io/...` and neighboring execution-state paths. + +## Two remote roles + +The docs and code distinguish two related remote concepts: + +- `remote.root`: the storage/protocol root used for remote-backed execution and cache mutation +- `remote.project`: the project identity used for push, pull, fetch, and revision-style addressing + +That means a runtime can use remote-backed execution features without necessarily being configured for full project sync. + +## Sync and execution are related but not identical + +Push and pull move repository state between local storage and the remote CAS-plus-refs layout. 
+ +Execution also depends on the remote when non-builtin adapters need shared cache and lifecycle state. In other words, the remote is both: + +- a publication layer for repository history, and +- a coordination layer for distributed execution + +## Integrity first + +Remote operations validate object ids, manifests, ref payloads, and path rules. DaggerML treats malformed remote state as a hard failure rather than trying to guess what the data meant. + +That validation includes the shape of manifest refs, the path rules for project branches and tags, and the one-segment cache-ref convention used for execution memoization. + +See also: + +- [Execution](execution.md) +- [Commits and history](commits-and-history.md) +- [Storage](storage.md) diff --git a/docs/concepts/storage.md b/docs/concepts/storage.md new file mode 100644 index 0000000..aa301ef --- /dev/null +++ b/docs/concepts/storage.md @@ -0,0 +1,42 @@ +# Storage + +DaggerML stores repository objects locally and refers to large external payloads indirectly. + +## Local repository storage + +The core repository store is LMDB-backed and organized around typed refs. Objects are written and read through namespaces such as `dag`, `commit`, `tree`, `node-*`, and `datum-*`. + +A few consequences fall out of that design: + +- identity is based on persisted refs, not Python object identity +- objects validate before write +- readers see complete object graphs across transaction boundaries +- shared sub-objects can be reused without copying + +## Transactions and snapshots + +Mutations happen inside explicit write transactions. Readers see valid snapshots rather than partial updates. That matches the rest of the model: indexes are mutable working state, while DAGs and commits are immutable records written from that state. + +## Reachability and garbage collection + +Because objects refer to each other through refs, DaggerML can reason about reachability. 
Branch heads and indexes act as the main roots for deciding which objects are still live. + +Garbage collection removes repository objects that are no longer reachable from those roots. + +## External data stays external + +Not every value should live inside the repository. `Uri` values represent external locations such as files, object storage paths, or container/image targets. + +The repository stores the reference and related bookkeeping, not the payload bytes themselves. + +For cleanup-aware flows, DaggerML also has `Deletable` records that mark URI-backed resources as eligible for removal when the surrounding graph becomes unreachable. + +## Local storage versus remote publication + +Local repository storage is the working source of truth for a runtime. Remote publication is a separate concern layered on top of it. When objects are moved across process or machine boundaries, DaggerML can dump and reload object graphs or publish them through the remote CAS-plus-refs model. + +See also: + +- [Refs and namespaces](refs-and-namespaces.md) +- [Remotes](remotes.md) +- [Codecs and values](codecs-and-values.md) diff --git a/docs/contrib/README.md b/docs/contrib/README.md new file mode 100644 index 0000000..2e3c764 --- /dev/null +++ b/docs/contrib/README.md @@ -0,0 +1,28 @@ +# Contrib + +`daggerml.contrib` is the extension layer around the core DaggerML API. It adds delayed authoring helpers, adapters and executors, plugin registries, testing helpers, dataframe codecs, and S3-backed artifact utilities. + +This section is for readers who need to build or run contrib-backed DAGs, or who need to understand how contrib extends the runtime. + +Back to the [main docs home](../README.md). + +## Start here + +- [Concepts](concepts/README.md): how delayed contrib authoring, runnable lowering, and execution backends fit together. +- [Guides](guides/README.md): practical paths for writing, testing, and running contrib workloads. 
+- [Reference](reference/README.md): exact Python API, runtime surfaces, registries, status output, codecs, and `S3Store` behavior. +- [Architecture](architecture/README.md): how the contrib runtime is wired internally. + +## Fast paths + +- New to contrib: read [concepts/authoring-and-runnables.md](concepts/authoring-and-runnables.md), then [guides/write-and-test-a-funk.md](guides/write-and-test-a-funk.md). +- Running work outside the local Python process: read [concepts/runtime.md](concepts/runtime.md), then [guides/run-workloads-outside-the-local-process.md](guides/run-workloads-outside-the-local-process.md). +- Checking exact adapter, executor, or plugin details: go to [reference/runtime-surfaces.md](reference/runtime-surfaces.md). +- Working on contrib internals: start from [architecture/execution-flow.md](architecture/execution-flow.md). + +## What lives in contrib + +- Authoring helpers in `daggerml.contrib.api`: `funkify`, `dagclass`, `run`, `ref`, and `load`. +- Adapters in `daggerml.contrib.adapters`: the built-in `local` and `lambda` adapter entrypoints. +- Executors in `daggerml.contrib.executors`: `script`, `docker`, `ssh`, `batch`, and `cfn`. +- Utilities in `daggerml.contrib.s3`, `daggerml.contrib.codecs`, `daggerml.contrib.funks`, `daggerml.contrib.testing`, and `daggerml.contrib.status`. diff --git a/docs/contrib/architecture/README.md b/docs/contrib/architecture/README.md new file mode 100644 index 0000000..f0a8b24 --- /dev/null +++ b/docs/contrib/architecture/README.md @@ -0,0 +1,8 @@ +# Contrib Architecture + +These pages are for readers who want the internal runtime picture rather than just the public surfaces. + +- [Execution flow](execution-flow.md): how delayed authoring becomes adapter and executor work. +- [Supervisor and state](supervisor-and-state.md): how the script supervisor, runtime-owned state, and S3 handoff fit together. + +Back to [contrib home](../README.md). 
diff --git a/docs/contrib/architecture/execution-flow.md b/docs/contrib/architecture/execution-flow.md new file mode 100644 index 0000000..713195d --- /dev/null +++ b/docs/contrib/architecture/execution-flow.md @@ -0,0 +1,64 @@ +# Execution Flow + +This is the high-level path from a contrib-authored value to a terminal result. + +## 1. Authoring builds delayed values + +The process starts in Python code with values such as: + +- `DelayedRunnable` +- `DelayedRef` +- `DelayedLoad` +- compiled `dagclass` members + +At this point, contrib is still describing work, not executing it. + +## 2. DAG staging lowers delayed values + +When the DAG stages values, contrib lowering resolves: + +- local refs through the current DAG namespace, +- external loads through committed DAG state, +- delayed runnables through the selected adapter's `resolve_runnable(...)` method. + +That is where contrib turns declarative wrappers into concrete `Runnable` objects with concrete adapters and targets. + +## 3. The adapter boundary receives a canonical payload + +The adapter is given: + +- the `Runnable` +- the `argv_ptr` +- `cache_key` +- `execution_id` +- `remote` +- optional persisted `state` +- optional lifecycle fields such as `execution_status` + +Adapters are expected to perform one bounded step and return canonical JSON-compatible output. + +## 4. Executors handle backend-specific behavior + +For the built-in local path, `LocalAdapter.send(...)` looks up the executor by `(adapter="local", runnable.target.uri)` and delegates to `spec.handle(...)`. + +`ExecutorBase.handle(...)` then decides whether to: + +- call `start(...)` for a first launch, +- call `poll(...)` for a resumed launch, +- call `cancel(...)` when cancellation is pending. + +That shared control flow is why detached backends can still fit the same runtime model as synchronous ones. + +## 5. The runtime publishes the terminal result + +Executors do not publish final cache entries themselves. 
They return terminal status to the runtime, and the runtime-owned execution path publishes cache or failure results after observing `succeeded` or `failed`. + +This keeps cache publication centralized even when the actual work happened in another process, a container, a remote machine, or a cloud service. + +## Consequences for contrib authors + +- Wrapper order matters because each wrapper changes the next lowering step. +- The innermost script callable must be self-contained because it is serialized and replayed elsewhere. +- Detached executors must return durable first-launch state because later polls may happen in a different process. + +For the lower-level state and supervisor details, continue to [supervisor and state](supervisor-and-state.md). diff --git a/docs/contrib/architecture/supervisor-and-state.md b/docs/contrib/architecture/supervisor-and-state.md new file mode 100644 index 0000000..55f4e38 --- /dev/null +++ b/docs/contrib/architecture/supervisor-and-state.md @@ -0,0 +1,59 @@ +# Supervisor And State + +Two pieces make contrib runtime execution resumable in practice: the script supervisor and runtime-owned remote state. + +## Script supervisor + +The `script` executor starts `python -m daggerml.contrib.supervisor` in a detached process. + +The supervisor then: + +- validates the payload, +- creates a temporary workdir, +- initializes a worker repo under that workdir, +- launches the actual worker command, +- collects `stdout.log`, `stderr.log`, and `result.json`, +- streams stdout and stderr to CloudWatch best-effort. + +The worker result must be terminal after process exit: + +- success requires `status`, `error`, and a real `dag_id`, +- failure requires `status` and `error`, +- non-terminal worker results are rejected once the worker has exited. + +## Runtime-owned state model + +Contrib no longer treats executor instances as the owners of live state. The runtime stores execution coordination under the configured remote root. 
+ +The current model revolves around: + +- a mutex per `cache_key`, +- an active execution pointer for the current `cache_key`, +- launch state for a specific `execution_id`, +- execution records for lifecycle and cancellation tracking. + +This is why later polls can resume with immutable launch-time state instead of depending on the launching process to stay alive. + +## `ExecutionState.adapter_io(...)` + +Detached backends such as Docker and Batch need stable transport locations for a nested adapter payload. + +`adapter_io(exec_id, name)` derives those locations from: + +- `cache_key` +- `execution_id` +- a caller-chosen name such as `local:docker` or `lambda:batch` + +The executor can then: + +- write the nested input once, +- pass the input and output URIs to the remote worker, +- reconstruct the same paths during `poll(...)` without storing them in executor state. + +## Why this split matters + +- The supervisor isolates author code from the launcher process. +- Remote state keeps detached backends resumable and deduplicated. +- Centralized runtime ownership prevents adapters and executors from publishing terminal cache state independently. + +See also: [execution flow](execution-flow.md) diff --git a/docs/contrib/concepts/README.md b/docs/contrib/concepts/README.md new file mode 100644 index 0000000..ff931c4 --- /dev/null +++ b/docs/contrib/concepts/README.md @@ -0,0 +1,8 @@ +# Contrib Concepts + +These pages explain the mental model behind `daggerml.contrib`. + +- [Authoring and runnables](authoring-and-runnables.md): how `funkify`, delayed values, and `dagclass` definitions become concrete runtime work. +- [Runtime model](runtime.md): how adapters, executors, state, and external backends cooperate while a contrib call is running. + +Back to [contrib home](../README.md). 
diff --git a/docs/contrib/concepts/authoring-and-runnables.md b/docs/contrib/concepts/authoring-and-runnables.md new file mode 100644 index 0000000..8619d12 --- /dev/null +++ b/docs/contrib/concepts/authoring-and-runnables.md @@ -0,0 +1,72 @@ +# Authoring And Runnables + +Most contrib features exist to let you describe work now and choose where it runs later. + +## The core idea + +Core DaggerML gives you DAGs, nodes, refs, and commits. Contrib adds a second layer for authoring runnable work: + +- `api.funkify(...)` wraps a callable or sub-runnable as a `DelayedRunnable`. +- `api.ref(name)` and `api.load(dagname, nodename=None)` create delayed references that are resolved when a DAG is staged. +- `api.dagclass` lets you declare a DAG as a Python class, then compiles its members into delayed values. +- `api.run(instance, ...)` materializes that compiled class into a fresh DAG and commits the selected entrypoint result. + +The important boundary is that these helpers are declarative until the DAG normalizes values into concrete `Runnable` objects. + +## `funkify` is a lowering step, not an execution step + +`funkify` does not run your function. It records intent: + +- which adapter to use, +- which executor or target URI to aim at, +- any wrapper chain around a sub-runnable, +- any delayed values in kwargs. + +When the DAG later stages the value, contrib resolves the selected adapter from the adapter registry and asks it to produce a concrete `Runnable`. + +That means nested wrappers compose naturally. A function can be script-backed first, then wrapped for Docker, then wrapped again for SSH. + +## Script-backed functions are serialized source + +The default contrib path is `@api.funkify(uri="script", adapter="local")`. + +That path serializes the function source into an S3-backed script artifact and records metadata such as: + +- the function name, +- defaulted call kwargs, +- any `prepop` values, +- any extra helper objects or source lines you injected. 
+ +The worker only gets the serialized function source plus `extra_objs` and `extra_lines`. It does not get your module globals for free. If a script-backed function needs an import at runtime, import inside the function body or inject the dependency explicitly. + +## `dagclass` turns a class into a DAG recipe + +`@api.dagclass` compiles an instance at `__init__` time. + +During compilation, contrib: + +- collects fields and plain attributes, +- compiles plain methods into script-backed delayed runnables, +- infers member dependencies from `self.` reads in plain methods, +- builds a dependency graph across members, +- topologically orders the members for later materialization. + +`api.run(...)` then creates a DAG, inserts each compiled member by name, calls the chosen entrypoint, and commits the result. + +Use `dagclass` when you want a reusable DAG definition with explicit member names and dependencies. Use plain `funkify` when a direct callable wrapper is enough. + +## Delayed values stay in the same namespace + +`api.ref("name")` is for local DAG references. Inside a `dagclass`, it refers to the member namespace that `api.run(...)` later materializes. + +`api.load(...)` is for loading from another committed DAG. It participates in lowering, but it does not create a local member-ordering dependency the way `ref(...)` does. + +## Where prebuilt helpers fit + +Contrib also ships a few reusable helpers on top of the same model: + +- `daggerml.contrib.funks.docker_build` is a prebuilt script-backed delayed runnable for building container images. +- `daggerml.contrib.testing.defunkify(...)` peels a delayed runnable back to the innermost script callable for author-code unit tests. +- `daggerml.contrib.testing.MockNode` gives those tests the minimal `.value()` behavior many contrib callables expect. 
+ +Next: [Runtime model](runtime.md) diff --git a/docs/contrib/concepts/runtime.md b/docs/contrib/concepts/runtime.md new file mode 100644 index 0000000..359642f --- /dev/null +++ b/docs/contrib/concepts/runtime.md @@ -0,0 +1,85 @@ +# Runtime Model + +Contrib runtime execution is split between adapters, executors, and runtime-owned state. + +## Adapters choose the boundary + +An adapter is the outer transport boundary. The built-in adapters are: + +- `local`: dispatches to a local executor by calling its `handle(...)` path. +- `lambda`: sends the runtime payload to an AWS Lambda function and expects canonical JSON back. + +Adapters are responsible for: + +- parsing or emitting the adapter payload, +- selecting the concrete executor target, +- performing one bounded runtime step, +- returning one of the canonical result shapes. + +The canonical statuses are: + +- `running` +- `succeeded` +- `failed` +- `cancel-detached` + +## Executors own execution behavior + +Executors implement the actual backend behavior after adapter routing. + +Built-in executors in this repo are: + +- `script`: runs serialized Python through the contrib supervisor. +- `docker`: runs a nested adapter inside a Docker container. +- `ssh`: runs a nested adapter synchronously over SSH. +- `batch`: submits a nested adapter run to AWS Batch through Lambda. +- `cfn`: creates or updates an AWS CloudFormation stack and turns terminal outputs back into a DAG result. + +Some executors are synchronous in practice, but the runtime still treats execution in terms of `start`, `poll`, and `cleanup` behavior. + +## State belongs to the runtime, not the executor + +The runtime coordinates resumable work around two identifiers: + +- `cache_key`: identifies the computation and the active execution slot. +- `execution_id`: identifies one execution attempt. + +Runtime-owned state lives under the configured remote root. 
The main pieces are: + +- an advisory mutex for a `cache_key`, +- an active execution pointer for a `cache_key`, +- launch state for a specific `execution_id`, +- execution records for a specific `execution_id`. + +Executors return durable launch-time state in the first `running` result, and later polls resume from that immutable state. + +## Fire-and-monitor backends use S3 handoff + +`docker` and `batch` cannot rely on direct stdin and stdout piping once the child process is remote or detached. They use `ExecutionState.adapter_io(...)` to derive stable S3 input and output locations from `(cache_key, execution_id, name)`. + +That lets `start(...)` and `poll(...)` agree on the same payload locations without storing those URIs in executor state. + +## Script execution uses a supervisor + +The `script` executor does not run your serialized function directly in the launching process. + +Instead it: + +- writes a supervisor payload, +- starts `python -m daggerml.contrib.supervisor` in a detached process, +- lets the supervisor create an isolated repo and workdir, +- runs the script worker there, +- collects `result.json`, `stdout.log`, and `stderr.log`. + +The supervisor also streams worker stdout and stderr to CloudWatch on a best-effort basis while preserving the local log files. + +## Registries keep the system open-ended + +Adapters and executors are discovered from Python entry points as well as runtime registration calls. + +- Adapter entry point group: `daggerml.contrib.adapters` +- Executor entry point group: `daggerml.contrib.executors` + +That is why contrib can expose a small built-in catalog while still acting as a plugin surface. 
+ +Next: [Run workloads outside the local process](../guides/run-workloads-outside-the-local-process.md) diff --git a/docs/contrib/guides/README.md b/docs/contrib/guides/README.md new file mode 100644 index 0000000..5490b5d --- /dev/null +++ b/docs/contrib/guides/README.md @@ -0,0 +1,8 @@ +# Contrib Guides + +These guides are task-oriented and use the real contrib surfaces in this repository. + +- [Write and test a funk](write-and-test-a-funk.md): start with `funkify`, then unit test the innermost script callable. +- [Run workloads outside the local process](run-workloads-outside-the-local-process.md): wrap work for Docker, SSH, Batch, or CloudFormation. + +Back to [contrib home](../README.md). diff --git a/docs/contrib/guides/run-workloads-outside-the-local-process.md b/docs/contrib/guides/run-workloads-outside-the-local-process.md new file mode 100644 index 0000000..5e6b2ca --- /dev/null +++ b/docs/contrib/guides/run-workloads-outside-the-local-process.md @@ -0,0 +1,65 @@ +# Run Workloads Outside The Local Process + +Contrib wrappers compose, so the same logical function can be pushed through different backends. + +## Docker + +The repository example `examples/01-docker_dataset.py` shows the full pattern: + +1. Build a container image with `daggerml.contrib.funks.docker_build`. +2. Wrap a function with `@api.funkify(uri="docker", image=...)`. +3. Keep the innermost function script-backed so the container runs a nested adapter. + +Typical shape: + +```python +@api.funkify(uri="docker", image=api.ref("image"), flags=api.ref("dkr-flags")) +@api.funkify +def download_dataset(dag): + from sklearn.datasets import load_iris + return load_iris(as_frame=True).frame.dropna() +``` + +Use Docker when the worker needs dependencies that you do not want in the local Python environment. + +## SSH + +`examples/02-ssh_docker_dataset.py` adds an SSH wrapper around the Docker-backed function: + +```python +@api.funkify(uri="ssh", adapter="local", host=..., flags=..., env_files=...) 
+@api.funkify(uri="docker", image=api.ref("image"), flags=api.ref("dkr-flags")) +@api.funkify +def predict_target(dag, dataset, params): + ... +``` + +The SSH executor opens one SSH session, sources each file in `env_files`, and runs the nested adapter with `--poll`. It does not create a separate remote wrapper script or a contrib-managed remote workdir. + +## Batch + +Use the `batch` executor when the nested work should run in AWS Batch but the orchestration boundary stays Lambda-based. + +You supply: + +- `lambda_uri` +- `image` +- optional `cpu` +- optional `memory` +- optional `gpu` + +At runtime, the Lambda-side executor writes the nested adapter payload to S3, submits a Batch job, and later polls Batch for completion. + +## CloudFormation + +Use `cfn` when the result you want is a stack operation rather than a generic worker process. The executor creates or updates the stack, polls for terminal status, and commits stack outputs back as a DAG result. + +## Choosing a backend + +- Use `script` for the simplest local or subprocess-backed path. +- Use `docker` when the environment is the main problem. +- Use `ssh` when the machine boundary matters. +- Use `batch` when you need queued container execution in AWS. +- Use `cfn` when the workflow itself is infrastructure provisioning. + +For exact kwargs and runtime behavior, see [reference/runtime-surfaces.md](../reference/runtime-surfaces.md). diff --git a/docs/contrib/guides/write-and-test-a-funk.md b/docs/contrib/guides/write-and-test-a-funk.md new file mode 100644 index 0000000..535ecf1 --- /dev/null +++ b/docs/contrib/guides/write-and-test-a-funk.md @@ -0,0 +1,71 @@ +# Write And Test A Funk + +This is the shortest path from a Python function to a contrib-backed DAG call. + +## 1. Start with a script-backed function + +```python +from daggerml.contrib import api + +@api.funkify(uri="script", adapter="local") +def hello(dag, arg): + from uuid import uuid4 + + return f"{uuid4() = !s} and {arg.value() = }." 
+``` + +This is the same pattern used in `examples/00-hello_world.py`. + +## 2. Call it from a DAG + +```python +import daggerml as dml + +with dml.new(name="examples/00-hello-world") as dag: + dag.hello_fn = hello + result = dag.call(hello, 23, name="greeting") + dag.commit(result) +``` + +`arg` is node-like at runtime, so contrib author code usually reads inputs through `.value()`. + +## 3. Keep script workers self-contained + +The script worker only sees the serialized function source and anything you inject through `extra_objs` or `extra_lines`. + +Good pattern: + +- import dependencies inside the function body, +- or pass helper definitions through `extra_objs`, +- or inject source lines explicitly. + +Risky pattern: + +- relying on a module-level import or global constant that exists only in the author process. + +## 4. Unit test the innermost callable with `defunkify` + +```python +from daggerml.contrib.testing import defunkify + +call = defunkify(hello) +assert call(None, 23) is not None +``` + +`defunkify(...)` walks to the innermost script runnable, returns the original callable, wraps non-leading arguments as node-like values, and runs the test in an isolated temporary working directory. + +## 5. Use `MockNode` when you want explicit node-like values + +```python +from daggerml.contrib.testing import MockNode + +assert call(None, MockNode(23)) is not None +``` + +`MockNode` is intentionally small. It only gives you `.value()`. If a test needs real DAG, ref, or persistence behavior, switch to repository-backed APIs instead. + +## 6. Reach for `dagclass` when the workflow has named members + +Use `@api.dagclass` when you want a reusable DAG recipe with internal references and a named entrypoint. `api.run(instance, ...)` will create the DAG, materialize members, call the entrypoint, and commit the result. 
+ +See also: [reference/python-api.md](../reference/python-api.md) diff --git a/docs/contrib/reference/README.md b/docs/contrib/reference/README.md new file mode 100644 index 0000000..1425f0d --- /dev/null +++ b/docs/contrib/reference/README.md @@ -0,0 +1,9 @@ +# Contrib Reference + +These pages describe the exact surfaces exposed by `daggerml.contrib`. + +- [Python API](python-api.md): `funkify`, `dagclass`, `run`, delayed refs and loads, testing helpers, and prebuilt funks. +- [Runtime surfaces](runtime-surfaces.md): adapters, executors, registries, and status output. +- [S3 and codecs](s3-and-codecs.md): `S3Store`, `is_s3_uri`, and the built-in dataframe codecs. + +Back to [contrib home](../README.md). diff --git a/docs/contrib/reference/python-api.md b/docs/contrib/reference/python-api.md new file mode 100644 index 0000000..7ec3f98 --- /dev/null +++ b/docs/contrib/reference/python-api.md @@ -0,0 +1,96 @@ +# Python API + +## Authoring helpers + +`daggerml.contrib.api` exposes the main contrib authoring surface. + +| Surface | What it returns | Notes | +| --- | --- | --- | +| `api.funkify(...)` | `DelayedRunnable` | Supports decorator and wrapper forms. Default target is `uri="script"`, `adapter="local"`. | +| `api.ref(name)` | `DelayedRef` | Refers to another node in the same DAG namespace. | +| `api.load(dagname, nodename=None)` | `DelayedLoad` | Loads from another committed DAG during staging. | +| `@api.dagclass(...)` | compiled class | Compiles fields and methods into a DAG recipe at instance init time. | +| `api.run(instance, ...)` | `None` | Materializes the compiled class into a DAG, calls the entrypoint, and commits the result. | + +## `funkify` + +`funkify` accepts either: + +- a callable, +- a `Runnable`, +- a `DelayedRunnable`, +- or no positional input yet, in decorator-builder form. 
+ +Important behavior: + +- callable input is stored as `kwargs["fn"]` on the delayed runnable, +- wrapper input preserves the nested `sub` runnable chain, +- lowering happens later through the adapter registry, +- delayed refs and loads inside kwargs are resolved during DAG normalization. + +For script-backed callables: + +- the first parameter must be `dag`, +- defaulted parameters become call kwargs recorded on the runnable, +- unknown script kwargs are rejected, +- the generated script must parse as valid Python, +- the function must be globally definable in the rendered source. + +## `dagclass` and `run` + +`dagclass` is for class-shaped DAG definitions. + +Key rules from the current implementation: + +- compilation happens when the instance is created, +- plain methods are lowered through `funkify(..., uri="script", adapter="local")`, +- dependency inference watches `self.` reads inside plain methods, +- member cycles and unknown member references fail before execution, +- reserved member names include `dag`, `dml`, `argv`, `call`, `put`, and `commit`. + +`api.run(instance, ..., entrypoint=None, name=None)`: + +- requires a compiled dagclass instance, +- resolves the entrypoint from the explicit argument or the class default, +- inserts compiled members into the DAG under their original names, +- calls the entrypoint and commits the result. + +## Testing helpers + +`daggerml.contrib.testing` exposes: + +- `MockNode(value)`: minimal node-like wrapper with `.value()`. +- `MockNode.from_value(value)`: preserves real `Node` and `MockNode` instances. +- `defunkify(delayed)`: unwraps to the innermost script callable and runs it in an isolated temporary workdir. + +`defunkify(...)` is only for delayed runnable chains whose innermost runnable is script-backed and still retains a callable in `kwargs["fn"]`. + +## Prebuilt funks + +`daggerml.contrib.funks` currently exports `docker_build`. 
+ +Effective call shape: + +```python +docker_build(context_tarball, build_flags=(), repo=None) +``` + +Behavior: + +- untars the build context through `S3Store`, +- runs `docker build`, +- returns an S3 tar `Uri` by default, +- tags and pushes to `repo` when `repo` is provided. + +## Status API + +`daggerml.contrib.status.status()` returns a JSON-safe report with: + +- `schema_version` +- `summary` +- `adapters` +- `executors` +- `codecs` +- `diagnostics` + +Use it when you need structured introspection of effective contrib registrations instead of ad hoc printing. diff --git a/docs/contrib/reference/runtime-surfaces.md b/docs/contrib/reference/runtime-surfaces.md new file mode 100644 index 0000000..c3cde37 --- /dev/null +++ b/docs/contrib/reference/runtime-surfaces.md @@ -0,0 +1,116 @@ +# Runtime Surfaces + +## Built-in adapters + +| Adapter | Registry name | Executable | Behavior | +| --- | --- | --- | --- | +| Local adapter | `local` | `dml-local-adapter` | Calls a local executor's `handle(...)` method. | +| Lambda adapter | `lambda` | `dml-lambda-adapter` | Invokes an AWS Lambda function and validates canonical JSON output. 
| + +All adapters exchange the same logical payload fields: + +- `runnable` +- `argv_ptr` +- `cache_key` +- `execution_id` +- `remote` +- `state` +- `execution_status` +- `cancel_requested_by` + +Canonical adapter results are: + +- `{"status": "running", "error": null, "state": {...}}` +- `{"status": "succeeded", "error": null, "dag_id": "<dag-id>"}` +- `{"status": "failed", "error": "..."}` +- `{"status": "cancel-detached", "error": null}` + +## Built-in executors + +### `script` + +- Adapter: `local` +- Requires `sub is None` +- Accepted kwargs: `fn`, `prepop`, `extra_objs`, `extra_lines` +- Serializes the callable source into S3 and runs it through the contrib supervisor + +### `docker` + +- Adapter: `local` +- Requires a nested `sub` runnable +- Accepted kwargs: `image`, optional `flags` +- Starts a detached container, then polls container state and reads the nested adapter result from S3 + +### `ssh` + +- Adapter: `local` +- Requires a nested `sub` runnable +- Accepted kwargs: `host`, optional `flags`, optional `env_files` +- Runs the nested adapter synchronously over SSH + +### `batch` + +- Adapter: `lambda` +- Requires a nested `sub` runnable +- Accepted kwargs: `lambda_uri`, `image`, optional `cpu`, optional `memory`, optional `gpu` +- Uses Lambda as the adapter boundary and AWS Batch as the executor backend + +### `cfn` + +- Adapter: `local` +- Accepts CloudFormation-oriented stack data +- Creates or updates a stack, then commits outputs back into a DAG result + +## Registries and plugin discovery + +Contrib keeps adapters and executors in separate registries.
+ +| Registry | Module | Entry point group | Lookup key | +| --- | --- | --- | --- | +| Adapter registry | `daggerml.contrib.adapter_registry` | `daggerml.contrib.adapters` | adapter name | +| Executor registry | `daggerml.contrib.executor_registry` | `daggerml.contrib.executors` | `(adapter, executor)` | + +Plugin entry points may return: + +- one registration object, +- an iterable of registration objects, +- or a callable that returns either of those. + +Registration objects are validated before being accepted. + +## `status()` report + +`daggerml.contrib.status.status()` returns one JSON-safe snapshot of: + +- effective adapters, +- effective executors, +- registered codecs, +- duplicate-key warnings, +- plugin load or validation failures. + +Its top-level schema is currently: + +```python +{ + "schema_version": 0, + "summary": {...}, + "adapters": [...], + "executors": [...], + "codecs": [...], + "diagnostics": [...], +} +``` + +The report is meant for structured introspection, not for human-friendly formatting. + +## Execution-state helpers + +Contrib runtime state is coordinated outside the adapter and executor objects themselves. + +Important pieces: + +- the runtime acquires a mutex per `cache_key`, +- launch state and execution records are stored under the remote root, +- `ExecutionState.adapter_io(...)` derives stable S3 input and output paths for detached backends like Docker and Batch. + +For the internal flow behind those helpers, see [../architecture/supervisor-and-state.md](../architecture/supervisor-and-state.md). diff --git a/docs/contrib/reference/s3-and-codecs.md b/docs/contrib/reference/s3-and-codecs.md new file mode 100644 index 0000000..6028342 --- /dev/null +++ b/docs/contrib/reference/s3-and-codecs.md @@ -0,0 +1,57 @@ +# S3 And Codecs + +## `S3Store` + +`daggerml.contrib.s3.S3Store` is the contrib utility for storing external payloads in S3-backed object storage. 
+ +When you construct it without explicit `bucket` and `prefix`, it reads `remote.root` from the active DaggerML config and derives a data prefix at `<remote.root>/data`. + +That keeps contrib artifact storage separate from the repository-managed `dml/` protocol namespace. + +## Main `S3Store` operations + +| Method | Purpose | +| --- | --- | +| `parse_uri(...)` | Normalize a name, `Uri`, or node-like value into `(bucket, key)`. | +| `put(...)` | Content-addressed write of bytes or a local file. | +| `get(...)` | Read raw bytes. | +| `exists(...)` | Existence check using `head_object`. | +| `ls(...)` | List object URIs from the current prefix or a supplied root. | +| `rm(...)` | Delete one or more objects. | +| `put_js(...)` / `get_js(...)` | JSON helpers. | +| `tar(...)` / `untar(...)` | Archive a local directory or unpack a stored tarball. | +| `cd(...)` | Rebase to a different prefix while preserving the client and bucket. | + +Important behavior: + +- writes are content-addressed by `sha256(payload_bytes) + suffix`, +- `tar(...)` normalizes archive metadata for deterministic output, +- `untar(..., unsafe=False)` rejects absolute paths and destination-escaping members, +- `is_s3_uri(value)` only returns `True` for non-empty `s3://bucket/key` values. + +## Built-in contrib codecs + +`daggerml.contrib.codecs.literal_codecs()` returns the built-in contrib dataframe codecs that are available in the current process. + +Current catalog: + +- pandas `DataFrame` codec +- polars `DataFrame` codec + +Both codecs: + +- match only their own dataframe type, +- serialize to parquet bytes, +- publish those bytes through `S3Store.put(..., suffix=".parquet")`, +- return an external `Uri` rather than in-repo literal storage. + +If the optional backend library is not installed, that codec simply does not appear in `literal_codecs()`. + +## Where these surfaces show up + +- `docker_build` uses `S3Store` for build contexts and image tarballs.
+- `script` uses `S3Store` for serialized script artifacts. +- `docker` may load an image tar from an S3 `Uri`. +- dataframe values can be externalized automatically through the contrib codecs. + +See also: [runtime surfaces](runtime-surfaces.md) diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..d665ca5 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,61 @@ +# Getting Started + +Use this page to get a local DaggerML repo running, create one DAG, and inspect it from the CLI. + +## Install + +DaggerML requires Python 3.10+. + +Install `daggerml` in whichever Python environment you want to use: + +```bash +pip install "daggerml" +``` + +## Initialize A Local Repo + +`dml init` initializes an existing directory, so create one first: + +```bash +mkdir demo && cd demo +dml init +``` + +That creates `./demo/.dml/` with the local database and config. + +## Create Your First DAG + +Run the following code in any Python environment you like, such as a script, a notebook, or a REPL. + +```python +import daggerml as dml + +dag = dml.new(name="hello", message="add hello dag") +result = dag.put({"message": "hello", "value": 42}, name="result") +dag.put([1, 2, 3], name="inputs") +dag.commit(result) +``` + +This creates a committed DAG named `hello` on the local `main` branch. + +## Inspect It + +```bash +dml status +dml dag list +dml dag get hello +dml show +``` + +Use these commands to answer the first questions you usually have: + +- `status`: current HEAD, branches, visible DAGs, and open indexes +- `dag list`: DAG names at the selected revision +- `dag get hello`: the stored DAG payload, including named nodes +- `show`: the current commit plus DAG-level changes from its parent + +## Next Steps + +- Read [reference/cli.md](reference/cli.md) for the generated command surface. +- Read [reference/python-api.md](reference/python-api.md) for the Python entrypoints: `Dml`, `new()`, and `load()`. 
+- Read [concepts/dags-and-nodes.md](concepts/dags-and-nodes.md) and [concepts/commits-and-history.md](concepts/commits-and-history.md) for the model behind what you just created. diff --git a/docs/guides/README.md b/docs/guides/README.md new file mode 100644 index 0000000..a20e8e8 --- /dev/null +++ b/docs/guides/README.md @@ -0,0 +1,15 @@ +# Guides + +Use this section for task-oriented walkthroughs built around the current DaggerML CLI and Python API. + +These pages stay focused on the steps. For deeper background, follow the links into [concepts](../concepts/README.md) and [reference](../reference/README.md). + +The command examples use `dml` directly. If you are running from a repository checkout instead of an installed CLI, prefix those commands with `uv run`. + +## Workflows + +- [Create and run a DAG](create-and-run-a-dag.md): initialize a repo, build a DAG in Python, and inspect the result from the CLI. +- [Inspect a repository](inspect-a-repository.md): check HEAD, branches, DAGs, commits, and revision-to-revision changes. +- [Work with remotes](work-with-remotes.md): configure `remote.root` and `remote.project`, inspect remote refs, and sync with `fetch`, `pull`, and `push`. +- [Store and load external data](store-and-load-external-data.md): keep large bytes in S3 while committing `Uri` references into DAG state. +- [Troubleshoot common errors](troubleshoot-common-errors.md): fix the most common setup, sync, and DAG-authoring failures. diff --git a/docs/guides/create-and-run-a-dag.md b/docs/guides/create-and-run-a-dag.md new file mode 100644 index 0000000..e08fd12 --- /dev/null +++ b/docs/guides/create-and-run-a-dag.md @@ -0,0 +1,74 @@ +# Create and run a DAG + +This is the smallest end-to-end DaggerML workflow that is easy to verify locally: initialize a repo, create a DAG in Python, commit the result, then inspect it from the CLI. + +## 1. 
Initialize a repo + +```bash +dml init --project-home ./demo-repo --user alice@example.com +``` + +That creates `.dml/` state under `./demo-repo` and leaves `HEAD` attached to `main`. + +## 2. Create and commit a DAG in Python + +```python +from daggerml import Dml, new + +dml = Dml(project_home="./demo-repo", user="alice@example.com") + +with new("numbers", message="create numbers dag", dml=dml) as dag: + left = dag.put(2, name="left") + right = dag.put(3, name="right") + result = dag.put({"sum": left.value() + right.value()}, name="result") + dag.commit(result) +``` + +Two practical details from the current API: + +- Name the final node explicitly if you want a stable `result` entry in the committed DAG. +- Commit with `dag.commit(result)`. `Dag.result` is a read surface on committed DAGs, not a writable property. + +## 3. Inspect the result from the CLI + +```bash +dml --project-home ./demo-repo status +dml --project-home ./demo-repo dag list +dml --project-home ./demo-repo dag get numbers +dml --project-home ./demo-repo show +``` + +Typical uses: + +- `status` shows the attached branch, current commit, visible DAGs, and open indexes. +- `dag list` shows which DAG names exist at the selected revision. +- `dag get numbers` shows the committed DAG payload, including named nodes. +- `show` summarizes the current commit and the DAG-level change from its first parent. + +## 4. Run a callable node when you need execution + +The current Python API also supports `dag.call(...)` and `RunnableNode(...)` execution. 
A repo-backed example used in this codebase's tests looks like this: + +```python +from pathlib import Path + +from daggerml import Dml, Runnable, Uri, new + +adapter = str(Path("tests/assets/internal_fn/python-fork-adapter.py").resolve()) +fn = Runnable(target=Uri("./tests/assets/fns/sum.py"), adapter=adapter, kwargs={"x": 10}) + +dml = Dml(project_home="./demo-repo", user="alice@example.com") + +with new("sum", message="run sum", dml=dml) as dag: + total = dag.call(fn, 1, 2, 3, name="total") + dag.commit(total) +``` + +Non-builtin execution may need remote runtime context, depending on the adapter you use. + +## Related docs + +- [DAGs and nodes](../concepts/dags-and-nodes.md) +- [Python API](../reference/python-api.md) +- [CLI](../reference/cli.md) +- [Errors](../reference/errors.md) diff --git a/docs/guides/inspect-a-repository.md b/docs/guides/inspect-a-repository.md new file mode 100644 index 0000000..f2a7727 --- /dev/null +++ b/docs/guides/inspect-a-repository.md @@ -0,0 +1,83 @@ +# Inspect a repository + +Use the CLI when you want a quick JSON view of repo state, and use the Python API when you want the same information inside any Python environment, such as a script, a notebook, or a REPL. + +## Check the current checkout + +```bash +dml --project-home ./demo-repo status +dml --project-home ./demo-repo branch +``` + +Use `status` for the full picture and `branch` when you only care about branch names and the current attached head. + +Python equivalent: + +```python +from daggerml import Dml + +dml = Dml(project_home="./demo-repo") + +print(dml.status()) +print(dml.branch()) +``` + +## Look at commits and changes + +```bash +dml --project-home ./demo-repo log --limit 5 +dml --project-home ./demo-repo show +dml --project-home ./demo-repo diff --left HEAD~1 --right HEAD +``` + +Notes for the generated CLI: + +- `log` takes `--limit` because `limit` is an optional method parameter. +- `diff` uses `--left` and `--right` for the same reason. 
+ +Python equivalent: + +```python +from daggerml import Dml + +dml = Dml(project_home="./demo-repo") + +print(dml.log(limit=5)) +print(dml.show()) +print(dml.diff("HEAD~1", "HEAD")) +``` + +## Inspect DAGs at a revision + +```bash +dml --project-home ./demo-repo dag list +dml --project-home ./demo-repo dag get numbers +``` + +Use `dag list` to discover names, then `dag get` when you want the DAG payload with named nodes. + +Python equivalent: + +```python +from daggerml import Dml + +dml = Dml(project_home="./demo-repo") + +print(dml.dag.list()) +print(dml.dag.get("numbers")) +``` + +## Inspect older revisions + +Most repo and DAG inspection surfaces take revision selectors such as `HEAD`, `HEAD~1`, `main`, or `origin/main`. + +```bash +dml --project-home ./demo-repo dag list --revision HEAD~1 +dml --project-home ./demo-repo log --limit 1 +``` + +## Related docs + +- [Commits and history](../concepts/commits-and-history.md) +- [CLI](../reference/cli.md) +- [Python API](../reference/python-api.md) diff --git a/docs/guides/store-and-load-external-data.md b/docs/guides/store-and-load-external-data.md new file mode 100644 index 0000000..d7c845f --- /dev/null +++ b/docs/guides/store-and-load-external-data.md @@ -0,0 +1,69 @@ +# Store and load external data + +Use DaggerML to track references to large external artifacts, not to inline the bytes into repository state. The current repo support for this lives in `Uri` values and `daggerml.contrib.s3.S3Store`. + +## 1. Create a store + +If your repo already has `remote.root` configured, `S3Store()` uses that root automatically. + +```python +from daggerml.contrib.s3 import S3Store + +store = S3Store() +``` + +If you want to be explicit, construct the store from an S3 remote root: + +```python +from daggerml.contrib.s3 import S3Store + +store = S3Store.from_remote_root("s3://bucket/prefix") +``` + +## 2. 
Upload bytes and commit the resulting `Uri` + +```python +from daggerml import Dml, new +from daggerml.contrib.s3 import S3Store + +dml = Dml(project_home="./demo-repo", remote_root="s3://bucket/prefix", user="alice@example.com") +store = S3Store.from_remote_root("s3://bucket/prefix") + +artifact_uri = store.put(data=b"hello world", suffix=".txt") + +with new("artifacts", message="store external data", dml=dml) as dag: + result = dag.put(artifact_uri, name="result") + dag.commit(result) +``` + +The DAG now stores a `Uri`, while the payload bytes stay in S3. + +## 3. Load the `Uri` later and read the payload + +```python +from daggerml import Dml, load +from daggerml.contrib.s3 import S3Store + +dml = Dml(project_home="./demo-repo", remote_root="s3://bucket/prefix") +store = S3Store.from_remote_root("s3://bucket/prefix") + +artifact_uri = load("artifacts", dml=dml).result.value() +payload = store.get(artifact_uri) +``` + +## 4. Common follow-up operations + +```python +listed = store.ls(recursive=True) +exists = store.exists(artifact_uri) +store.rm(artifact_uri) +``` + +`S3Store` also supports JSON helpers with `put_js()` and `get_js()`, plus tarball upload and extraction with `tar()` and `untar()`. + +## Related docs + +- [Storage](../concepts/storage.md) +- [Codecs and values](../concepts/codecs-and-values.md) +- [Python API](../reference/python-api.md) +- [Errors](../reference/errors.md) diff --git a/docs/guides/troubleshoot-common-errors.md b/docs/guides/troubleshoot-common-errors.md new file mode 100644 index 0000000..6ee53b0 --- /dev/null +++ b/docs/guides/troubleshoot-common-errors.md @@ -0,0 +1,85 @@ +# Troubleshoot common errors + +These are the most common workflow errors surfaced by the current CLI and Python API. + +## `remote.root is required` + +You will hit this when you ask DaggerML to bootstrap or recover a remote project without telling it where the remote data lives. 
+ +Example: + +```bash +dml init --project-home ./demo-repo --remote-project dml://alice/demo +``` + +Fix it by adding `--remote-root s3://bucket/prefix` or setting `DML_REMOTE_ROOT` first. + +## `remote.project is required for project sync` + +You will hit this when you try `fetch`, `pull`, or `push` in a repo that has a remote root but no configured project URI. + +Fix it by setting `remote.project`: + +```bash +dml --project-home ./demo-repo config set remote.project dml://alice/demo +``` + +## `DAG not found: <name>` + +You will hit this when `load("name")` or `dml dag get name` points at a DAG that is not present in the selected revision. + +Start by checking what exists: + +```bash +dml --project-home ./demo-repo dag list +``` + +If the DAG exists on another revision, pass that revision explicitly. + +## `Current checkout is detached; attach HEAD to commit` + +You will hit this when you try to commit through the Python API while `HEAD` is detached. + +Fix it by reattaching `HEAD` to a branch before creating the DAG commit: + +```bash +dml --project-home ./demo-repo checkout main +``` + +## `Unknown kwarg: <name>` + +You will hit this when `dag.call(...)` passes a keyword argument that the runnable does not accept. + +Example: + +```python +result = dag.call(fn, 1, 2, 3, y=100) +``` + +Fix it by matching the runnable's declared keyword arguments. + +## `S3Store requires configured remote.root` + +You will hit this when you call `S3Store()` without a configured S3 remote root. + +Fix it in one of two ways: + +- configure `remote.root` for the repo and then call `S3Store()` +- or use `S3Store.from_remote_root("s3://bucket/prefix")` + +## When the CLI fails early + +The CLI prints normal command results as JSON. Failures are shown as errors on stderr instead.
If a command shape looks right but still fails, check the built-in help for the generated option names: + +```bash +dml diff --help +dml pull --help +dml dag get --help +``` + +## Related docs + +- [Reference home](../reference/README.md) +- [CLI](../reference/cli.md) +- [Python API](../reference/python-api.md) +- [Errors](../reference/errors.md) diff --git a/docs/guides/work-with-remotes.md b/docs/guides/work-with-remotes.md new file mode 100644 index 0000000..6830be0 --- /dev/null +++ b/docs/guides/work-with-remotes.md @@ -0,0 +1,83 @@ +# Work with remotes + +DaggerML splits remote configuration into two pieces: + +- `remote.root`: where remote-backed data lives, such as `s3://bucket/prefix` +- `remote.project`: which project name to sync, such as `dml://alice/demo` + +You can have `remote.root` without `remote.project`, but project sync commands need both. + +## Initialize a repo with remote sync enabled + +```bash +dml init \ + --project-home ./demo-repo \ + --user alice@example.com \ + --remote-root s3://bucket/prefix \ + --remote-project dml://alice/demo +``` + +If you are adding remote sync to an existing repo, you can update config directly: + +```bash +dml --project-home ./demo-repo config set remote.root s3://bucket/prefix +dml --project-home ./demo-repo config set remote.project dml://alice/demo +``` + +## Check what remote config is active + +```bash +dml --project-home ./demo-repo config show +``` + +Python equivalent: + +```python +from daggerml import Dml + +dml = Dml(project_home="./demo-repo", remote_root="s3://bucket/prefix") + +print(dml.config.show()) +``` + +## Discover remote projects and refs + +```bash +dml --project-home ./demo-repo admin remote list --owner alice +dml --project-home ./demo-repo admin remote list --project dml://alice/demo +``` + +Use the first form to discover projects for one owner and the second to list that project's tracked branches and tags. 
+ +## Sync history + +```bash +dml --project-home ./demo-repo fetch origin --branch main +dml --project-home ./demo-repo pull origin alice@example.com +dml --project-home ./demo-repo push --branch main --create +``` + +Python equivalents: + +```python +from daggerml import Dml + +dml = Dml(project_home="./demo-repo", remote_root="s3://bucket/prefix", user="alice@example.com") + +dml.fetch("origin", branch="main") +dml.pull("origin", user="alice@example.com") +dml.push(branch="main", create=True) +``` + +## When to use each command + +- `fetch` updates local history from a remote branch without changing your current branch. +- `pull` fetches a remote branch and merges it into a local branch. +- `push` publishes a local branch or tag to the configured remote project. + +## Related docs + +- [Remotes](../concepts/remotes.md) +- [CLI](../reference/cli.md) +- [Configuration](../reference/configuration.md) +- [Errors](../reference/errors.md) diff --git a/docs/reference/README.md b/docs/reference/README.md new file mode 100644 index 0000000..b7cfc36 --- /dev/null +++ b/docs/reference/README.md @@ -0,0 +1,10 @@ +# Reference + +Use this section when you need exact details about the user-facing surfaces DaggerML exposes today. + +- [Python API](python-api.md): package exports, `Dml`, DAG helpers, and node wrappers. +- [CLI](cli.md): generated `dml` commands, namespaces, flags, input parsing, and output behavior. +- [Configuration](configuration.md): resolved config shape, precedence, environment variables, and repo-local files. +- [Errors](errors.md): the error types and failure modes you will see from Python and the CLI. + +These pages track the user-facing code in `src/daggerml/` and `src/daggerml/_cli.py`. 
diff --git a/docs/reference/cli.md b/docs/reference/cli.md new file mode 100644 index 0000000..03c7499 --- /dev/null +++ b/docs/reference/cli.md @@ -0,0 +1,152 @@ +# CLI + +The CLI entrypoint is defined in `pyproject.toml` as: + +```toml +dml = "daggerml._cli:cli" +``` + +`src/daggerml/_cli.py` builds the command tree directly from the public `Dml` class and its namespace properties. + +## Global behavior + +Global flags: + +- `-v`, `-vv`, `-vvv`: increase logging verbosity +- `--project-home PATH` +- `--remote-root URI` +- `--user NAME` +- `--config-home PATH` + +Success output: + +- Commands print their return values as formatted JSON to `stdout`. + +Failure output: + +- Argument parsing errors print usage plus `error: ...` to `stderr` and exit with code `2`. +- `Ctrl+C` exits with code `130`. +- Other exceptions are logged and then printed as `error: ...` on `stderr`. + +Input parsing rules: + +- Required parameters become positional arguments. +- Parameters with defaults become `--kebab-case` options. +- Boolean options become `--flag` or `--no-flag`. +- `Ref` and `Uri` arguments are parsed from strings. +- `list[...]` and `dict[...]` arguments are parsed from JSON text. + +That last rule matters for commands such as `dml admin cache invalidate`, which currently expects one JSON list argument rather than repeated positional cache keys. + +## Top-level commands + +Generated directly from public `Dml` methods: + +- `dml init` +- `dml status` +- `dml branch` +- `dml log` +- `dml show` +- `dml diff` +- `dml checkout` +- `dml fetch` +- `dml pull` +- `dml push` +- `dml merge` +- `dml revert` + +Common examples: + +```bash +dml init --project-home . 
+dml status +dml branch +dml log HEAD --limit 5 +dml show HEAD +dml diff HEAD~1 HEAD +``` + +## `config` namespace + +Generated from `Dml.config`: + +- `dml config get KEY [--scope local|global]` +- `dml config set KEY VALUE [--scope local|global]` +- `dml config show [--contrib]` + +Examples: + +```bash +dml config get remote.root +dml config set remote.root s3://my-bucket/demo +dml config show +``` + +## `dag` namespace + +Generated from `Dml.dag`: + +- `dml dag list [--revision REV]` +- `dml dag describe VALUE [--revision REV]` +- `dml dag get VALUE [--revision REV]` +- `dml dag describe-node NODE [--dag DAG] [--revision REV]` +- `dml dag get-node NODE [--dag DAG] [--revision REV]` +- `dml dag unroll-node NODE [--dag DAG] [--revision REV]` +- `dml dag checkout REVISION DAG_NAME [--branch BRANCH] [--target-name NAME] [--replace] [--user USER]` +- `dml dag delete NAME [--branch BRANCH] [--user USER]` + +Use this namespace for committed DAG history and inspection, not for staging new DAGs from the shell. + +## `runtime` namespace + +Generated from `Dml.runtime`: + +- `create` +- `get-node` +- `get-argv` +- `put-literal` +- `put-import` +- `set-node-name` +- `start-fn` +- `commit` +- `list` +- `describe` +- `cancel` + +This is the low-level mutable staging surface behind `daggerml.api.new()` and `Dag`. + +## `admin` namespace + +Generated from `Dml.admin` and nested namespaces: + +- `dml admin index list|get|delete` +- `dml admin cache invalidate` +- `dml admin remote list|gc` +- `dml admin gc [--dry-run]` + +Examples: + +```bash +dml admin index list +dml admin cache invalidate '["cache-key-1","cache-key-2"]' +dml admin remote list +dml admin gc --dry-run +``` + +## CLI-only limits + +The CLI is generated from typed Python signatures, so it only exposes argument shapes that can be represented at the command line. 
+ +Not available through `dml`: + +- passing live Python callables +- `@api.funkify`-style workflows +- arbitrary in-process Python object serialization + +Use the Python API for those flows. + +## Related pages + +- [Python API](python-api.md) +- [Configuration](configuration.md) +- [Errors](errors.md) diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md new file mode 100644 index 0000000..4e1a8c4 --- /dev/null +++ b/docs/reference/configuration.md @@ -0,0 +1,133 @@ +# Configuration + +DaggerML uses one resolved configuration model across the Python API and the CLI. + +## Resolved shape + +The resolved config returned by the internal resolver has this shape: + +```json +{ + "project": { + "home": "string-or-null" + }, + "db": { + "path": "string-or-null" + }, + "remote": { + "project": "string-or-null", + "root": "string", + "fetch_workers": 16 + }, + "user": "string-or-null", + "default_branch": "main", + "config_home": "string" +} +``` + +Canonical keys: + +- `project.home` +- `db.path` +- `remote.project` +- `remote.root` +- `remote.fetch_workers` +- `user` +- `default_branch` +- `config_home` + +## Precedence + +Resolved values are layered in this order: + +1. defaults +2. global config +3. project config for project-scoped resolution +4. environment variables +5. explicit constructor or CLI overrides + +Notes: + +- Later layers override earlier ones key by key. +- Empty or missing higher-precedence values do not erase lower-precedence values. +- For project-scoped resolution, `project.home` defaults to the current working directory when not provided. + +## Environment variables + +- `DML_PROJECT_HOME` -> `project.home` +- `DML_DB_PATH` -> `db.path` +- `DML_REMOTE_PROJECT` -> `remote.project` +- `DML_REMOTE_ROOT` -> `remote.root` +- `DML_REMOTE_FETCH_WORKERS` -> `remote.fetch_workers` +- `DML_USER` -> `user` +- `DML_DEFAULT_BRANCH` -> `default_branch` +- `DML_CONFIG_HOME` -> `config_home` + +Global config home resolution: + +1. 
`DML_CONFIG_HOME` +2. `$XDG_CONFIG_HOME/dml` +3. `~/.config/dml` + +## Repo-local files + +Project state lives under `.dml/` inside `project.home`. + +- `.dml/config.toml`: repo-local remote settings +- `.dml/db/`: local object database +- `.dml/HEAD`: current checkout state +- `.dml/.gitignore`: created during init + +`Dml.init(...)` creates `.dml/`, writes `.dml/.gitignore`, writes `.dml/config.toml` if needed, and creates the database when it does not already exist. + +## Field rules + +- `default_branch` defaults to `main`. +- `db.path` defaults to `<project.home>/.dml/db` for project-scoped resolution. +- `remote.fetch_workers` must be a positive integer and defaults to `16`. +- `remote.root` must be empty or an `s3://bucket` or `s3://bucket/prefix` URI. +- `remote.project` must be a bare `dml://<owner>/<project>` URI. +- `remote.project` may not include `#branch` or `@tag` in config. + +`remote.root` enables remote-backed execution and storage. `remote.project` is the additional setting required for project-addressed sync such as `push`, `pull`, and `fetch` against a configured project. + +## Python and CLI entrypoints + +Python: + +```python +from daggerml import Dml + +dml = Dml( + project_home=".", + remote_root="s3://my-bucket/demo", + user="alice@example", +) +``` + +CLI: + +```bash +dml --project-home . --remote-root s3://my-bucket/demo status +dml config show +dml config set remote.root s3://my-bucket/demo +``` + +## Config file locations and contents + +Global config is read from `config.toml` under the resolved config home. The current resolver reads: + +- `[user].name` +- `[defaults].branch` +- `[remote].fetch_workers` + +Project config is read from `.dml/config.toml`.
The current resolver reads: + +- `[remote].project` +- `[remote].root` +- `[remote].fetch_workers` + +## Related pages + +- [CLI](cli.md) +- [Errors](errors.md) diff --git a/docs/reference/errors.md b/docs/reference/errors.md new file mode 100644 index 0000000..7eb652e --- /dev/null +++ b/docs/reference/errors.md @@ -0,0 +1,80 @@ +# Errors + +DaggerML surfaces a small set of important error layers to users. + +## Python-visible error types + +### `daggerml.Error` + +`Error` is both an exception type and a stored DAG value. It carries: + +- `message` +- `origin` +- `type` +- `stack` + +When a DAG is used as a context manager, uncaught exceptions are converted with `Error.from_ex(...)` and committed as the DAG result. + +### `DmlRepoError` + +Most repository, runtime, config, revision, and execution failures surface as `DmlRepoError`, a subclass of `Error`. + +Common examples from the current code: + +- `No default Dml is configured` +- `DAG not found: <name>` +- `No active index` +- `Cannot set node names on a committed DAG.` +- `Current checkout is detached; attach HEAD to commit` +- `remote.project is required for project sync` +- `remote.root is required` +- `Unknown kwarg: <name>` +- `Adapter output must be JSON` +- `Remote context required for adapter invocation` + +### Low-level database errors + +The native database layer defines `DmlDbError` and many subclasses such as `DmlDbMapFullError` and `DmlDbEnvReopenedError`. + +Those types are mostly internal. The repo layer retries some of them automatically and usually re-surfaces user-facing failures as `DmlRepoError`. + +## What different surfaces do + +### Python API + +- `load(name, ...)` raises `DmlRepoError` for a missing DAG name. +- `Dag.call(...)` and `RunnableNode(...)` raise `TimeoutError` when the timeout expires. +- Invalid node-name key types raise `TypeError`. +- Codec staging failures are wrapped as `DmlRepoError`. 
+- `node.argv` raises `Error("Node has no argv", origin="dml", type="TypeError")` when the node has no argv list. + +### CLI + +The current CLI does not return a structured JSON error envelope. + +- Parse and usage failures come from `argparse` and exit with code `2`. +- `KeyboardInterrupt` exits with code `130`. +- Other failures are logged with `logging.exception(...)` and then printed as `error: <message>` on `stderr`. +- Successful command results still go to `stdout` as JSON. + +## Retry behavior + +The internal transaction wrapper retries whole operations when the database raises: + +- `DmlDbMapFullError`: after resizing the database +- `DmlDbEnvReopenedError`: after the environment has been reopened + +That retry loop is internal; callers generally see either success or the final surfaced exception. + +## Practical debugging tips + +- If a sync command fails with `remote.project is required for project sync`, configure `remote.project` in `.dml/config.toml` or with `dml config set`. +- If a remote-backed flow fails with `remote.root is required`, set `remote.root` in config, via environment, or through `Dml(...)` / CLI flags. +- If a function call fails with `Unknown kwarg: ...`, check the runnable's accepted keyword parameters. +- If an adapter run fails with `Adapter output must be JSON`, inspect the adapter process output before it reaches DaggerML. 
+ +## Related pages + +- [Python API](python-api.md) +- [CLI](cli.md) +- [Configuration](configuration.md) diff --git a/docs/reference/python-api.md b/docs/reference/python-api.md new file mode 100644 index 0000000..650e831 --- /dev/null +++ b/docs/reference/python-api.md @@ -0,0 +1,160 @@ +# Python API + +The main Python entrypoint is the package root: + +```python +from daggerml import Dml, Error, Ref, Runnable, Uri +from daggerml import clear_default_dml, get_default_dml, load, new, set_default_dml +from daggerml import status, temporary, use_default_dml +``` + +Those names come from `src/daggerml/__init__.py` and `src/daggerml/api.py`. + +## Runtime and helper exports + +### `Dml` + +`Dml` is the session object for repository, history, runtime, DAG, and admin workflows. + +Constructor arguments: + +- `project_home: str | None = None` +- `remote_root: str | None = None` +- `user: str | None = None` +- `config_home: str | None = None` + +Top-level methods: + +- `Dml.init(...)`: initialize a repository under `project_home`. +- `status()`: current HEAD, branches, DAGs, and open runtime indexes. +- `branch(remote=False)`: list local branches or discovered remote-tracking branches. 
+- `log(revision="HEAD", limit=None)` +- `show(revision="HEAD")` +- `diff(left="HEAD~1", right="HEAD")` +- `checkout(revision)` +- `fetch(remote_or_uri, branch=None)` +- `pull(remote_or_uri, remote_branch=None, *, branch=None, user)` +- `push(tag=None, *, branch=None, create=False, force=False)` +- `merge(revision, *, branch=None, user)` +- `revert(revision, *, branch=None, user)` + +Namespaces exposed as properties: + +- `dml.config`: `get`, `set`, `show` +- `dml.runtime`: create, inspect, mutate, commit, list, and cancel runtime indexes +- `dml.dag`: list, describe, get, inspect nodes, copy a DAG from history, and delete a DAG +- `dml.admin`: index, cache, remote, and GC operations + +### Default-runtime helpers + +The convenience helpers in `daggerml.api` keep a process-global or context-local default `Dml` instance. + +- `get_default_dml()` returns the active default and creates one if needed. +- `set_default_dml(dml)` installs a process default. +- `clear_default_dml()` removes the process default. +- `use_default_dml(dml)` temporarily overrides the default in the current context. +- `status()` returns both default-runtime metadata and `dml.status()`. + +Resolution order: + +1. the active `use_default_dml(...)` scoped override +2. the process default set through `set_default_dml(...)` +3. a lazily created implicit `Dml()` instance + +That implicit instance is cached after first creation, so later top-level helpers reuse it instead of constructing a fresh runtime each time. + +`status()` is designed to stay JSON-serializable. It reports the default-runtime source plus the active runtime's config and repository status instead of returning live Python objects. 
+ +### Repository helpers + +- `new(name="", *, message="", argv_ptr=None, dml=None) -> Dag` +- `load(name: str, dml=None) -> Dag` +- `temporary(**kw)` yields a temporary `Dml` initialized in a temporary directory + +`load()` looks up a named DAG through `dml.dag.get(name)` and raises `DmlRepoError(f"DAG not found: {name}")` when the name is missing. + +## Working with DAGs + +`new()` returns a mutable `Dag` wrapper backed by a runtime index. + +```python +from daggerml import Dml, new + +dml = Dml(project_home=".") + +dag = new("demo", message="first dag", dml=dml) +answer = dag.put(42, name="answer") +dag.commit(answer) +``` + +Important `Dag` behavior: + +- `dag["name"]` is the canonical named-node lookup. +- `dag.name` falls back to named-node lookup only when `name` is not already a real `Dag` attribute. +- `dag.result` is the committed DAG result property. +- `dag["result"]` looks up a node literally named `"result"`. +- `dag.put(value, name=None)` stages Python values through the codec system and returns a `Node` wrapper. +- `dag.call(fn, *args, name=None, sleep=None, timeout=-1, **kw)` stages a function call and returns the result node. +- `dag.commit(value)` writes the DAG result into repository history. +- If you use `Dag` as a context manager, uncaught exceptions are converted to `Error` values and committed. + +Using `with dag:` is for error capture. Successful DAGs still need an explicit `dag.commit(...)` call. + +`Dag.call()` stages plain Python values and existing node arguments into the current runtime index before execution. It raises `TimeoutError` if the call does not finish before `timeout`. 
+ +## Node wrappers + +Every staged or committed node is wrapped as one of these classes: + +- `ScalarNode`: scalar values, including `Uri` +- `RunnableNode`: callable `Runnable` values +- `ListNode`: list-like values +- `DictNode`: dict-like values + +Common node methods: + +- `node.value()`: materialize the concrete value +- `node.load()`: load the DAG that owns the node +- `node.argv`: access the node's argv list when present +- `node.type`: cached type label such as `list`, `dict`, or `runnable` + +Collection helpers: + +- `ListNode[i]`, `ListNode[start:stop]` +- `ListNode.append(item)` / `ListNode.conj(item)` +- `DictNode[key]` +- `DictNode.get(key, default=None)` +- `CollectionNode.contains(item)` + +`RunnableNode(*args, **kw)` delegates to `dag.call(...)` and returns another node. + +## Value types + +The public value wrappers re-exported from the package are: + +- `Ref`: persistent object reference +- `Uri`: URI-backed datum +- `Runnable`: callable datum stored in the graph +- `Error`: captured execution error with `message`, `origin`, `type`, and `stack` + +## Example: create and read a DAG + +```python +from daggerml import Dml, load, new + +dml = Dml(project_home=".") + +dag = new("numbers", message="store a list", dml=dml) +values = dag.put([1, 2, 3], name="values") +dag.commit(values) + +saved = load("numbers", dml=dml) +print(saved["values"].value()) +print(saved.result.value()) +``` + +## Related pages + +- [CLI](cli.md) +- [Configuration](configuration.md) +- [Errors](errors.md) diff --git a/examples/00-hello_world.py b/examples/00-hello_world.py new file mode 100644 index 0000000..9abea65 --- /dev/null +++ b/examples/00-hello_world.py @@ -0,0 +1,34 @@ +"""Run a minimal local-script hello world through `@api.funkify`. + +This example executes a simple funkified Python function with the local script +runtime using an already configured remote URI. 
+""" + +from __future__ import annotations + +import daggerml as dml +from daggerml.contrib import api + + +@api.funkify(uri="script", adapter="local") +def hello(dag, arg): + from uuid import uuid4 + + arg = arg.value() + return f"{uuid4() = !s} and {arg = }." + + +def main() -> None: + with dml.new(name="examples/00-hello-world") as dag: + dag.hello_fn = hello + result = dag.call(hello, 23, name="greeting") + dag.commit(result) + loaded = dml.load("examples/00-hello-world") + print(loaded.result.value()) + with dml.new(name="examples/00-hello-world-redux") as dag: + print(dag.call(hello, 23).value()) + print(dag.call(hello, 42).value()) + + +if __name__ == "__main__": + main() diff --git a/examples/01-docker_dataset.py b/examples/01-docker_dataset.py new file mode 100644 index 0000000..75ab918 --- /dev/null +++ b/examples/01-docker_dataset.py @@ -0,0 +1,125 @@ +"""Run an end-to-end Docker-backed dataset pipeline. + +The example builds a Docker image from this repository, loads the iris dataset +in one Docker-executed funk, and trains a small classifier in another. It +exercises the contrib runtime end to end: script funkification, Docker +execution, remote cache publication, and S3-backed artifact exchange between +DAG nodes. 
+""" + +from __future__ import annotations + +import json +import os +from pathlib import Path +from time import time +from urllib.parse import urlparse + +import daggerml as dml +from daggerml.contrib import api +from daggerml.contrib.funks import docker_build +from daggerml.contrib.s3 import S3Store + +EXCLUDE_PATTERNS = ( + # ".git", # we need .git to install lib from the repo + "ignore/*", + ".venv/*", + ".mypy_cache/*", + ".pytest_cache/*", + "__pycache__/*", + "*.pyc", + "tests/*", +) +REPO_ROOT = Path(__file__).resolve().parents[1] + + +def _docker_run_flags() -> list[str]: + flags: list[str] = [] + endpoint = os.environ.get("AWS_ENDPOINT_URL") + if endpoint: + parsed = urlparse(endpoint) + if parsed.scheme == "http" and parsed.port is not None: + flags.extend( + [ + "--add-host=host.docker.internal:host-gateway", + "-e", + f"AWS_ENDPOINT_URL=http://host.docker.internal:{parsed.port}", + ] + ) + for key in ( + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", + "AWS_REGION", + "AWS_DEFAULT_REGION", + ): + value = os.environ.get(key) + if value: + flags.extend(["-e", f"{key}={value}"]) + return flags + + +@api.funkify(uri="docker", image=api.ref("image"), flags=api.ref("dkr-flags")) +@api.funkify +def download_dataset(dag): + from sklearn.datasets import load_iris # pyright:ignore[reportMissingImports] # noqa:F401 + + return load_iris(as_frame=True).frame.dropna() + + +@api.funkify(uri="docker", image=api.ref("image"), flags=api.ref("dkr-flags")) +@api.funkify +def predict_target(dag, dataset, params): + import pandas as pd # pyright:ignore[reportMissingImports] # noqa:F401 + from sklearn.linear_model import LogisticRegression # pyright:ignore[reportMissingImports] # noqa:F401 + from sklearn.metrics import r2_score # pyright:ignore[reportMissingImports] # noqa:F401 + from sklearn.model_selection import train_test_split # pyright:ignore[reportMissingImports] # noqa:F401 + + df = pd.read_parquet(dataset.value().uri) + X = 
df.drop(columns=["target"]) + y = df["target"] + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) + model = LogisticRegression(**params.value()) + model.fit(X_train, y_train) + train_r2 = r2_score(y_train, model.predict(X_train)) + test_r2 = r2_score(y_test, model.predict(X_test)) + return {"train": train_r2, "test": test_r2} + + +def main() -> None: + flags = _docker_run_flags() + try: + import pandas # pyright:ignore[reportMissingImports] # noqa:F401 + + raise RuntimeError("pandas should not be installed in the local environment for this example to work") + except ModuleNotFoundError: + pass + with dml.new(name="examples/01-docker-dataset") as dag: + dag.dkr_build = docker_build + s3 = S3Store() + print("Creating Docker build context from repo root, excluding patterns:", EXCLUDE_PATTERNS) + dkr_ctx = s3.tar(str(REPO_ROOT), excludes=EXCLUDE_PATTERNS, symlinks="ignore") + dag.put(flags, name="dkr-flags") + print("Building Docker image (this may take a moment)...") + t0 = time() + dag.dkr_build(dkr_ctx, build_flags=["-f", "./examples/dkr-ctx/Dockerfile"], name="image") + t1 = time() + dag.download = download_dataset + print("Loading dataset within Docker...") + dataset = dag.download(name="dataset") + print("Training model and generating predictions within Docker...") + dag.predict_fn = predict_target + predictions = dag.predict_fn( + dataset, + {"max_iter": 200, "solver": "saga", "penalty": "elasticnet", "l1_ratio": 0.2}, + name="predictions", + ) + print("Committing DAG to persist artifacts...") + dag.commit(predictions) + print("Reading predictions parquet from S3...") + print(json.dumps(predictions.value(), indent=2)) + print(f"\nBuild time: {t1 - t0:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/examples/01b-load_fn.py b/examples/01b-load_fn.py new file mode 100644 index 0000000..8b2647a --- /dev/null +++ b/examples/01b-load_fn.py @@ -0,0 +1,26 @@ +"""Run an end-to-end Docker-backed dataset pipeline. 
+ +The example builds a Docker image from this repository, loads the iris dataset +in one Docker-executed funk, and trains a small classifier in another. It +exercises the contrib runtime end to end: script funkification, Docker +execution, remote cache publication, and S3-backed artifact exchange between +DAG nodes. +""" + +import daggerml as dml + + +def main() -> None: + with dml.new(name="examples/01b-load-fn") as dag: + print("Training model and generating predictions within Docker...") + loaded_dag = dml.load("examples/00-hello-world") + dag.old_result = loaded_dag.greeting + dag.hello_fn = loaded_dag.hello_fn + # dag.hello_fn = loaded_dag.greeting.load().argv.value()[1] + print(dag.hello_fn(42).value()) + print(dag.hello_fn(-1).value()) + dag.commit(dag.hello_fn(42)) + + +if __name__ == "__main__": + main() diff --git a/examples/02-ssh_docker_dataset.py b/examples/02-ssh_docker_dataset.py new file mode 100644 index 0000000..6f37d32 --- /dev/null +++ b/examples/02-ssh_docker_dataset.py @@ -0,0 +1,202 @@ +"""Run the Docker dataset pipeline through SSH. + +This example is the SSH-backed sibling of ``01-docker_dataset.py``. It starts a +local sshd that points back to the current machine, writes an env file for the +remote SSH session, builds the same Docker image from this repository, and then +executes the Docker-backed funks over SSH. 
+""" + +from __future__ import annotations + +import getpass +import json +import os +import shlex +import shutil +import socket +import subprocess +import sys +import time +from contextlib import contextmanager +from pathlib import Path +from tempfile import TemporaryDirectory +from textwrap import dedent +from typing import NamedTuple + +import daggerml as dml +from daggerml.contrib import api + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +class SshServer(NamedTuple): + host: str + flags: list[str] + env_file: str + + +def _require_local_tools() -> None: + missing = [name for name in ("docker", "ssh", "sshd", "ssh-keygen") if shutil.which(name) is None] + if missing: + raise RuntimeError(f"Missing required local tools: {', '.join(missing)}") + + +def _start_local_sshd(tmpdir: str) -> tuple[subprocess.Popen[bytes], list[str], str]: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + + host_key_path = Path(tmpdir) / "ssh_host_ed25519_key" + client_key_path = Path(tmpdir) / "client_ed25519_key" + authorized_keys_path = Path(tmpdir) / "authorized_keys" + sshd_config_path = Path(tmpdir) / "sshd_config" + pid_file = Path(tmpdir) / "sshd.pid" + + subprocess.run(["ssh-keygen", "-q", "-t", "ed25519", "-N", "", "-f", str(host_key_path)], check=True) + subprocess.run(["ssh-keygen", "-q", "-t", "ed25519", "-N", "", "-f", str(client_key_path)], check=True) + shutil.copyfile(client_key_path.with_suffix(".pub"), authorized_keys_path) + authorized_keys_path.chmod(0o600) + + sshd_config_path.write_text( + dedent( + f""" + Port {port} + ListenAddress 127.0.0.1 + HostKey {host_key_path} + PidFile {pid_file} + LogLevel VERBOSE + StrictModes no + PasswordAuthentication no + KbdInteractiveAuthentication no + ChallengeResponseAuthentication no + PubkeyAuthentication yes + AuthorizedKeysFile {authorized_keys_path} + UsePAM no + PermitRootLogin no + """ + ).strip() + + "\n" + ) + + sshd_path = 
shutil.which("sshd") + assert sshd_path is not None + sshd_proc = subprocess.Popen( + [sshd_path, "-D", "-e", "-f", str(sshd_config_path)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + deadline = time.time() + 5.0 + while time.time() < deadline: + if sshd_proc.poll() is not None: + stdout, stderr = sshd_proc.communicate(timeout=1) + raise RuntimeError( + "local sshd failed to start:\n" + f"stdout: {stdout.decode(errors='replace')}\n" + f"stderr: {stderr.decode(errors='replace')}" + ) + try: + with socket.create_connection(("127.0.0.1", port), timeout=0.25): + break + except OSError: + time.sleep(0.1) + else: + sshd_proc.terminate() + raise RuntimeError("timeout waiting for local sshd to start") + + flags = [ + "-i", + str(client_key_path), + "-p", + str(port), + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "-o", + "IdentitiesOnly=yes", + ] + host = f"{getpass.getuser()}@127.0.0.1" + return sshd_proc, flags, host + + +def _write_ssh_env_file(tmpdir: str) -> str: + env_file = Path(tmpdir) / "ssh.env" + exports = { + "PATH": f"{Path(sys.executable).parent}:{os.environ.get('PATH', '')}", + "UV_PROJECT": str(REPO_ROOT), + "DML_REMOTE_ROOT": os.environ["DML_REMOTE_ROOT"], + **{k: v for k, v in os.environ.items() if k.startswith("AWS_")}, + } + env_file.write_text( + "\n".join(f"export {name}={shlex.quote(value)}" for name, value in sorted(exports.items())) + "\n" + ) + return str(env_file) + + +@contextmanager +def ssh_server(dag): + sshd_proc = None + try: + with TemporaryDirectory(prefix="daggerml-ssh-example-") as tmpdir: + sshd_proc, ssh_flags, ssh_host = _start_local_sshd(tmpdir) + ssh_env_file = _write_ssh_env_file(tmpdir) + dag.put(ssh_host, name="ssh-host") + dag.put(ssh_flags, name="ssh-flags") + dag.put([ssh_env_file], name="ssh-env-files") + yield SshServer(ssh_host, ssh_flags, ssh_env_file) + return + finally: + if sshd_proc is not None: + sshd_proc.terminate() + try: + sshd_proc.wait(timeout=5) + except 
subprocess.TimeoutExpired: + sshd_proc.kill() + + +@api.funkify( + uri="ssh", + adapter="local", + host=api.ref("ssh-host"), + flags=api.ref("ssh-flags"), + env_files=api.ref("ssh-env-files"), +) # send over ssh +@api.funkify(uri="docker", image=api.ref("image"), flags=api.ref("dkr-flags")) # run in docker +@api.funkify # defaults to: run in a python subprocess +def predict_target(dag, dataset, params): + import pandas as pd # pyright:ignore[reportMissingImports] # noqa:F401 + from sklearn.metrics import r2_score # pyright:ignore[reportMissingImports] # noqa:F401 + from sklearn.model_selection import train_test_split # pyright:ignore[reportMissingImports] # noqa:F401 + from sklearn.neighbors import KNeighborsClassifier # pyright:ignore[reportMissingImports] # noqa:F401 + + df = pd.read_parquet(dataset.value().uri) + X = df.drop(columns=["target"]) + y = df["target"] + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) + model = KNeighborsClassifier(**params.value()) + model.fit(X_train, y_train) + train_r2 = r2_score(y_train, model.predict(X_train)) + test_r2 = r2_score(y_test, model.predict(X_test)) + return {"train": train_r2, "test": test_r2} + + +def main() -> None: + _require_local_tools() + with dml.new(name="examples/02-ssh-docker-dataset") as dag: + loaded_dag = dml.load("examples/01-docker-dataset") + dag.image = loaded_dag.image + dag.dataset = loaded_dag.dataset + dag.put(loaded_dag["dkr-flags"], name="dkr-flags") + with ssh_server(dag): + print("Training model and generating predictions within Docker over SSH...") + predictions = dag.call(predict_target, dag.dataset, {}, name="predictions") + print("Committing DAG to persist artifacts...") + dag.commit(predictions) + print("Reading predictions parquet from S3...") + print(json.dumps(predictions.value(), indent=2)) + + +if __name__ == "__main__": + main() diff --git a/examples/03-load_docker_dataset.py b/examples/03-load_docker_dataset.py new file mode 100644 index 
0000000..0233976 --- /dev/null +++ b/examples/03-load_docker_dataset.py @@ -0,0 +1,33 @@ +"""Reuse nodes from the Docker dataset example in a new DAG. + +The example loads the ``examples/01-docker-dataset`` DAG, assigns its +``predict_fn`` and ``dataset`` nodes to a new DAG, and calls the prediction +funk again with a different ``l1_ratio``. It then commits the predictions +and prints them as JSON. The ``examples/01-docker-dataset`` DAG must already +be available locally, because it is loaded by name. +""" + +import json + +import daggerml as dml + + +def main() -> None: + with dml.new(name="examples/03-load-docker-dataset") as dag: + print("Training model and generating predictions within Docker...") + loaded_dag = dml.load("examples/01-docker-dataset") + dag.predict_fn = loaded_dag.predict_fn + dag.dataset = loaded_dag.dataset + predictions = dag.predict_fn( + dag.dataset, + {"max_iter": 200, "solver": "saga", "penalty": "elasticnet", "l1_ratio": 0.5}, + name="predictions", + ) + print("Committing DAG to persist artifacts...") + dag.commit(predictions) + print("Reading predictions parquet from S3...") + print(json.dumps(predictions.value(), indent=2)) + + +if __name__ == "__main__": + main() diff --git a/examples/dkr-ctx/Dockerfile b/examples/dkr-ctx/Dockerfile new file mode 100644 index 0000000..e316879 --- /dev/null +++ b/examples/dkr-ctx/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.13-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential cmake git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY . /app + +RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir "/app" pandas scikit-learn pyarrow s3fs + +ENV PYTHONUNBUFFERED=1 diff --git a/examples/moto_server_env.py b/examples/moto_server_env.py new file mode 100644 index 0000000..ca4f935 --- /dev/null +++ b/examples/moto_server_env.py @@ -0,0 +1,83 @@ +"""Start a local moto server and write sourceable AWS env vars. 
+ +This helper mirrors the local moto setup used across examples: it starts moto on +an ephemeral port, creates the example bucket, writes a shell env file, and +keeps running until Ctrl-C. +""" + +from __future__ import annotations + +import os +import shlex +import tempfile +import time +from pathlib import Path + +import boto3 + + +def _write_env_file(env_values: dict[str, str]) -> Path: + fd, path_str = tempfile.mkstemp(prefix="daggerml-moto-", suffix=".env") + path = Path(path_str) + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write("# Source this file to configure local moto-backed DaggerML examples.\n") + for key, value in env_values.items(): + f.write(f"export {key}={shlex.quote(value)}\n") + return path + + +def main() -> None: + for var in os.environ.keys(): + if var.startswith("AWS_") or var.startswith("DML_"): + del os.environ[var] + try: + from moto.server import ThreadedMotoServer + except ModuleNotFoundError as e: + raise RuntimeError("Install moto[server] to run this helper: pip install 'moto[server]'") from e + + server = ThreadedMotoServer(port=0, verbose=False) + env_file: Path | None = None + try: + server.start() + host, port = server.get_host_and_port() + endpoint = f"http://{host}:{port}" + + env_values = { + "AWS_ACCESS_KEY_ID": "test", + "AWS_SECRET_ACCESS_KEY": "test", + "AWS_REGION": "us-east-1", + "AWS_DEFAULT_REGION": "us-east-1", + "AWS_SHARED_CREDENTIALS_FILE": "/dev/null", + "AWS_ENDPOINT_URL": endpoint, + "DML_REMOTE_ROOT": "s3://daggerml-example/artifacts", + } + + for key, value in env_values.items(): + os.environ[key] = value + + boto3.client("s3", endpoint_url=endpoint).create_bucket(Bucket="daggerml-example") + env_file = _write_env_file(env_values) + + print("Moto server started.") + print(f" Endpoint: {endpoint}") + print(" Bucket: daggerml-example") + print(f" DML_REMOTE_ROOT: {env_values['DML_REMOTE_ROOT']}") + print() + print(f"Env file written: {env_file}") + print(f"Source it with: source {env_file}") + 
print("\nPress Ctrl-C to stop moto and clean up.") + + while True: + time.sleep(1) + except KeyboardInterrupt: + print("\nCtrl-C received, shutting down...") + finally: + if env_file is not None and env_file.exists(): + env_file.unlink() + print(f"Deleted env file: {env_file}") + server.stop() + print("Moto server stopped.") + + +if __name__ == "__main__": + main() diff --git a/examples/run_examples_integration.sh b/examples/run_examples_integration.sh new file mode 100755 index 0000000..91075f8 --- /dev/null +++ b/examples/run_examples_integration.sh @@ -0,0 +1,208 @@ +#!/usr/bin/env bash + +set -euo pipefail + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +examples_dir="${repo_root}/examples" +ignore_dir="${repo_root}/ignore" +scratch_dir="${ignore_dir}/scratch" +moto_dir="${ignore_dir}/.integration-moto-$(date +%s)-$$" +moto_env_file="${moto_dir}/moto.env" +moto_log_file="${moto_dir}/moto.log" +moto_pid="" +export DML_CONFIG_HOME="${scratch_dir}/dml_config" + +log() { + echo + echo "*** $* ***" +} + +s3_ls_recursive() { + local s3_uri="$1" + python - "$s3_uri" <<'PY' +from __future__ import annotations + +import os +import sys +from urllib.parse import urlparse + +import boto3 + + +def main() -> None: + uri = sys.argv[1] + parsed = urlparse(uri) + if parsed.scheme != "s3" or not parsed.netloc: + raise RuntimeError(f"expected s3://bucket[/prefix], got: {uri!r}") + + bucket = parsed.netloc + prefix = parsed.path.lstrip("/") + endpoint_url = os.environ.get("AWS_ENDPOINT_URL") or None + + client = boto3.client("s3", endpoint_url=endpoint_url) + paginator = client.get_paginator("list_objects_v2") + + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + for obj in page.get("Contents", []): + dt = obj["LastModified"].strftime("%Y-%m-%d %H:%M:%S") + size = obj["Size"] + key = obj["Key"] + print(f"{dt} {size:>10d} {key}") + + +if __name__ == "__main__": + main() +PY +} + +cleanup() { + if [[ -n "${moto_pid}" ]]; then + kill "${moto_pid}" 
>/dev/null 2>&1 || true + wait "${moto_pid}" >/dev/null 2>&1 || true + fi + rm -rf "${moto_dir}" + if [[ "${KEEP_EXAMPLE_SCRATCH:-0}" == "1" ]]; then + log "Keeping scratch directory: ${scratch_dir}" + return + fi + rm -rf "${scratch_dir}" +} +trap cleanup EXIT + +mkdir -p "${moto_dir}" +mkdir -p "${DML_CONFIG_HOME}" +dml_user="cool-guy" +dml config set --scope global user $dml_user + +log "Starting moto server and preparing env..." +python - "${moto_env_file}" >"${moto_log_file}" 2>&1 <<'PY' & +from __future__ import annotations + +import os +import shlex +import signal +import sys +import time +from pathlib import Path +from urllib.parse import urlparse + +import boto3 +from moto.server import ThreadedMotoServer + + +def main() -> None: + env_file = Path(sys.argv[1]) + remote_root = os.environ.get("DML_EXAMPLE_REMOTE_ROOT", "s3://daggerml-example/artifacts") + parsed = urlparse(remote_root) + if parsed.scheme != "s3" or not parsed.netloc: + raise RuntimeError(f"DML_EXAMPLE_REMOTE_ROOT must be s3://bucket[/prefix], got: {remote_root!r}") + bucket = parsed.netloc + + server = ThreadedMotoServer(port=0, verbose=False) + server.start() + host, port = server.get_host_and_port() + endpoint = f"http://{host}:{port}" + + env_values = { + "AWS_ACCESS_KEY_ID": "test", + "AWS_SECRET_ACCESS_KEY": "test", + "AWS_REGION": "us-east-1", + "AWS_DEFAULT_REGION": "us-east-1", + "AWS_SHARED_CREDENTIALS_FILE": "/dev/null", + "AWS_ENDPOINT_URL": endpoint, + "DML_REMOTE_ROOT": remote_root, + } + + for key, value in env_values.items(): + os.environ[key] = value + + boto3.client("s3", endpoint_url=endpoint).create_bucket(Bucket=bucket) + env_file.write_text("\n".join(f"export {k}={shlex.quote(v)}" for k, v in env_values.items()) + "\n") + + stop = False + + def _handle_signal(_signum, _frame): + nonlocal stop + stop = True + + signal.signal(signal.SIGTERM, _handle_signal) + signal.signal(signal.SIGINT, _handle_signal) + + while not stop: + time.sleep(0.2) + + server.stop() + + +if 
__name__ == "__main__": + main() +PY +moto_pid="$!" + +for _ in $(seq 1 100); do + if [[ -s "${moto_env_file}" ]]; then + break + fi + if ! kill -0 "${moto_pid}" >/dev/null 2>&1; then + log "Moto bootstrap failed. Log output:" >&2 + cat "${moto_log_file}" >&2 + exit 1 + fi + sleep 0.1 +done + +if [[ ! -s "${moto_env_file}" ]]; then + log "Timed out waiting for moto env file. Log output:" >&2 + cat "${moto_log_file}" >&2 + exit 1 +fi + +# shellcheck disable=SC1090 +source "${moto_env_file}" +log "Moto ready: ${AWS_ENDPOINT_URL}" +log "Remote root: ${DML_REMOTE_ROOT}" + +log "Setting up DML repo in ${ignore_dir}" +mkdir -p "${scratch_dir}" || true +printf '*\n' > "${ignore_dir}/.gitignore" +cd "${scratch_dir}" + +project0="project-0" +log "Initializing DML repo in ${project0}" +mkdir "${scratch_dir}/${project0}" +cd "${scratch_dir}/${project0}" +dml init --remote-project "dml://${dml_user}/${project0}" + +log "DML repo initialized. Current status:" +dml status | jq . + +log "Running example: 00-hello_world.py" +python "${examples_dir}/00-hello_world.py" + +log "Running example: 01-docker_dataset.py" +python "${examples_dir}/01-docker_dataset.py" + +log "Running example: 02-ssh_docker_dataset.py" +python "${examples_dir}/02-ssh_docker_dataset.py" + +log "Listing DML refs after running all examples:" +s3_ls_recursive "${DML_REMOTE_ROOT}/dml/refs/projects/" +dml push --create +s3_ls_recursive "${DML_REMOTE_ROOT}/dml/refs/projects/" + +log "Cleaning up first project to test fresh init with existing remote" +cd .. 
&& rm -rf "${project0}" + +## Second "project" +project1="project-1" +log "Initializing DML repo in ${project1}" +mkdir "${scratch_dir}/${project1}" +cd "${scratch_dir}/${project1}" +dml init --remote-root "${DML_REMOTE_ROOT}" --remote-project "dml://${dml_user}/${project1}" +dml fetch "dml://${dml_user}/${project0}" +dml dag checkout "dml://${dml_user}/${project0}#main" "examples/01-docker-dataset" + +log "Running example: 03-load_docker_dataset.py" +python "${examples_dir}/03-load_docker_dataset.py" + +log "All examples completed successfully." diff --git a/examples/run_examples_integration_no_docker.sh b/examples/run_examples_integration_no_docker.sh new file mode 100755 index 0000000..dd0abcd --- /dev/null +++ b/examples/run_examples_integration_no_docker.sh @@ -0,0 +1,284 @@ +#!/usr/bin/env bash +# Note: you might have to run this script in `uv` + +set -euo pipefail + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +examples_dir="${repo_root}/examples" +ignore_dir="${repo_root}/ignore" +scratch_dir="${ignore_dir}/scratch" +moto_dir="${ignore_dir}/.integration-moto-$(date +%s)-$$" +moto_env_file="${moto_dir}/moto.env" +moto_log_file="${moto_dir}/moto.log" +moto_pid="" +export DML_CONFIG_HOME="${scratch_dir}/dml_config" + +log() { + echo >&2 + echo "*** $* ***" >&2 +} + +pretty_dml() { + log "Calling: dml $*" + dml "$@" | jq . +} + +json_scalar() { + jq -r . 
+} + +s3_ls_recursive() { + local s3_uri="$1" + python - "$s3_uri" <<'PY' +from __future__ import annotations + +import os +import sys +from urllib.parse import urlparse + +import boto3 + + +def main() -> None: + uri = sys.argv[1] + parsed = urlparse(uri) + if parsed.scheme != "s3" or not parsed.netloc: + raise RuntimeError(f"expected s3://bucket[/prefix], got: {uri!r}") + + bucket = parsed.netloc + prefix = parsed.path.lstrip("/") + endpoint_url = os.environ.get("AWS_ENDPOINT_URL") or None + + client = boto3.client("s3", endpoint_url=endpoint_url) + paginator = client.get_paginator("list_objects_v2") + + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + for obj in page.get("Contents", []): + dt = obj["LastModified"].strftime("%Y-%m-%d %H:%M:%S") + size = obj["Size"] + key = obj["Key"] + print(f"{dt} {size:>10d} {key}") + + +if __name__ == "__main__": + main() +PY +} + +cleanup() { + if [[ -n "${moto_pid}" ]]; then + kill "${moto_pid}" >/dev/null 2>&1 || true + wait "${moto_pid}" >/dev/null 2>&1 || true + fi + rm -rf "${moto_dir}" + if [[ "${KEEP_EXAMPLE_SCRATCH:-0}" == "1" ]]; then + log "Keeping scratch directory: ${scratch_dir}" + return + fi + rm -rf "${scratch_dir}" +} +trap cleanup EXIT + +mkdir -p "${moto_dir}" +mkdir -p "${DML_CONFIG_HOME}" +dml_user="cool-guy" +dml config set --scope global user $dml_user + +log "Starting moto server and preparing env..." 
+python - "${moto_env_file}" >"${moto_log_file}" 2>&1 <<'PY' & +from __future__ import annotations + +import os +import shlex +import signal +import sys +import time +from pathlib import Path +from urllib.parse import urlparse + +import boto3 +from moto.server import ThreadedMotoServer + + +def main() -> None: + env_file = Path(sys.argv[1]) + remote_root = os.environ.get("DML_EXAMPLE_REMOTE_ROOT", "s3://daggerml-example/artifacts") + parsed = urlparse(remote_root) + if parsed.scheme != "s3" or not parsed.netloc: + raise RuntimeError(f"DML_EXAMPLE_REMOTE_ROOT must be s3://bucket[/prefix], got: {remote_root!r}") + bucket = parsed.netloc + + server = ThreadedMotoServer(port=0, verbose=False) + server.start() + host, port = server.get_host_and_port() + endpoint = f"http://{host}:{port}" + + env_values = { + "AWS_ACCESS_KEY_ID": "test", + "AWS_SECRET_ACCESS_KEY": "test", + "AWS_REGION": "us-east-1", + "AWS_DEFAULT_REGION": "us-east-1", + "AWS_SHARED_CREDENTIALS_FILE": "/dev/null", + "AWS_ENDPOINT_URL": endpoint, + "DML_REMOTE_ROOT": remote_root, + } + + for key, value in env_values.items(): + os.environ[key] = value + + boto3.client("s3", endpoint_url=endpoint).create_bucket(Bucket=bucket) + env_file.write_text("\n".join(f"export {k}={shlex.quote(v)}" for k, v in env_values.items()) + "\n") + + stop = False + + def _handle_signal(_signum, _frame): + nonlocal stop + stop = True + + signal.signal(signal.SIGTERM, _handle_signal) + signal.signal(signal.SIGINT, _handle_signal) + + while not stop: + time.sleep(0.2) + + server.stop() + + +if __name__ == "__main__": + main() +PY +moto_pid="$!" + +for _ in $(seq 1 100); do + if [[ -s "${moto_env_file}" ]]; then + break + fi + if ! kill -0 "${moto_pid}" >/dev/null 2>&1; then + log "Moto bootstrap failed. Log output:" >&2 + cat "${moto_log_file}" >&2 + exit 1 + fi + sleep 0.1 +done + +if [[ ! -s "${moto_env_file}" ]]; then + log "Timed out waiting for moto env file. 
Log output:" >&2 + cat "${moto_log_file}" >&2 + exit 1 +fi + +# shellcheck disable=SC1090 +source "${moto_env_file}" +log "Moto ready: ${AWS_ENDPOINT_URL}" +log "Remote root: ${DML_REMOTE_ROOT}" + +log "Setting up DML repo in ${ignore_dir}" +mkdir -p "${scratch_dir}" || true +printf '*\n' > "${ignore_dir}/.gitignore" +cd "${scratch_dir}" + +project0="project-0" +log "Initializing DML repo in ${project0}" +mkdir "${scratch_dir}/${project0}" +cd "${scratch_dir}/${project0}" +dml init --remote-project "dml://${dml_user}/${project0}" + +log "Configuring and inspecting CLI-visible settings" +pretty_dml config set --scope local remote.fetch_workers 2 +pretty_dml config get remote.root +pretty_dml config get remote.fetch_workers +pretty_dml config show +pretty_dml config show --contrib + +log "DML repo initialized. Current status" +pretty_dml status + +log "Running example: 00-hello_world.py" +python "${examples_dir}/00-hello_world.py" + +log "Inspecting committed history and DAG state after 00-hello_world.py" +pretty_dml branch +pretty_dml log --revision HEAD --limit 10 +pretty_dml show --revision HEAD +pretty_dml diff --left HEAD~1 --right HEAD +pretty_dml dag list +pretty_dml dag describe examples/00-hello-world +pretty_dml dag get examples/00-hello-world +pretty_dml dag describe-node greeting --dag examples/00-hello-world +pretty_dml dag get-node greeting --dag examples/00-hello-world +pretty_dml dag unroll-node greeting --dag examples/00-hello-world + +hello_dag_ref="$(dml dag describe examples/00-hello-world | jq -r '.dag.ref')" +hello_fn_ref="$(dml dag describe-node hello_fn --dag examples/00-hello-world | jq -r '.node.ref')" +greeting_ref="$(dml dag describe-node greeting --dag examples/00-hello-world | jq -r '.node.ref')" + +log "Exercising low-level runtime and admin CLI commands" +runtime_idx="$(dml runtime create | json_scalar)" +scratch_idx="$(dml runtime create | json_scalar)" +cancel_idx="$(dml runtime create | json_scalar)" +pretty_dml runtime list 
+pretty_dml runtime describe "${runtime_idx}" +pretty_dml admin index list +pretty_dml admin index get "${runtime_idx}" + +seed_ref="$(dml runtime put-literal "${runtime_idx}" cli-seed --name seed | json_scalar)" +imported_greeting_ref="$(dml runtime put-import "${runtime_idx}" "${hello_dag_ref}" --node "${greeting_ref}" --name imported-greeting | json_scalar)" +hello_runtime_ref="$(dml runtime put-import "${runtime_idx}" "${hello_dag_ref}" --node "${hello_fn_ref}" --name hello-fn | json_scalar)" +pretty_dml runtime get-node "${runtime_idx}" seed +pretty_dml runtime get-node "${runtime_idx}" imported-greeting + +pretty_dml runtime set-node-name "${runtime_idx}" cli-greeting-alias "${imported_greeting_ref}" +pretty_dml runtime get-node "${runtime_idx}" cli-greeting-alias +pretty_dml runtime describe "${runtime_idx}" +pretty_dml runtime cancel "${cancel_idx}" +pretty_dml admin index delete "${scratch_idx}" +pretty_dml admin gc --dry-run +pretty_dml admin gc + +log "Exercising top-level checkout workflows" +pretty_dml checkout HEAD~1 +pretty_dml status +pretty_dml checkout main +pretty_dml show --revision HEAD + +log "Listing DML refs after running all examples:" +s3_ls_recursive "${DML_REMOTE_ROOT}/dml/refs/projects/" +pretty_dml admin remote list --owner "${dml_user}" +pretty_dml push --create +pretty_dml push --tag cli-demo-tag +pretty_dml admin remote list +pretty_dml admin remote gc --min-age-seconds 0 --malformed warn +s3_ls_recursive "${DML_REMOTE_ROOT}/dml/refs/projects/" + +log "Cleaning up first project to test fresh init with existing remote" +cd .. 
&& rm -rf "${project0}" + +## Second "project" +project1="project-1" +log "Initializing DML repo in ${project1}" +mkdir "${scratch_dir}/${project1}" +cd "${scratch_dir}/${project1}" +dml init --remote-project "dml://${dml_user}/${project1}" +pretty_dml fetch "dml://${dml_user}/${project0}" +pretty_dml branch --remote +pretty_dml dag checkout "dml://${dml_user}/${project0}#main" "examples/00-hello-world" --target-name examples/00-hello-world-copy +pretty_dml status +pretty_dml revert HEAD "${dml_user}" +pretty_dml merge "dml://${dml_user}/${project0}#main" "${dml_user}" +pretty_dml pull "dml://${dml_user}/${project0}" "${dml_user}" +pretty_dml dag checkout "dml://${dml_user}/${project0}#main" "examples/00-hello-world" +pretty_dml status + +log "Running example: 01b-load_fn.py" +python "${examples_dir}/01b-load_fn.py" + +log "Inspecting fetched and pulled history from the second project" +pretty_dml branch +pretty_dml branch --remote +pretty_dml log --revision HEAD --limit 10 +pretty_dml show --revision HEAD +pretty_dml dag list +pretty_dml dag describe examples/01b-load-fn --revision HEAD +pretty_dml dag get-node old_result --dag examples/01b-load-fn --revision HEAD + +log "All examples completed successfully." diff --git a/openspec/README.md b/openspec/README.md new file mode 100644 index 0000000..e61d366 --- /dev/null +++ b/openspec/README.md @@ -0,0 +1,10 @@ +# OpenSpec Workspace + +This `openspec/` tree is for maintainers and agents planning repository changes. +Use the `openspec` tool from within your agent harness to interact with this directory. + +- `changes/`: active and archived change proposals, designs, and task lists. +- `specs/`: current OpenSpec capability specs that describe repository behavior. +- `spec-overview.md`: a maintainer-facing map of which spec documents own which concepts. + +Use `docs/` for human-facing project documentation. 
diff --git a/openspec/changes/archive/2026-04-25-adapter-s3-io/.openspec.yaml b/openspec/changes/archive/2026-04-25-adapter-s3-io/.openspec.yaml new file mode 100644 index 0000000..1b75776 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-adapter-s3-io/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-25 diff --git a/openspec/changes/archive/2026-04-25-adapter-s3-io/design.md b/openspec/changes/archive/2026-04-25-adapter-s3-io/design.md new file mode 100644 index 0000000..7c00b63 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-adapter-s3-io/design.md @@ -0,0 +1,82 @@ +## Context + +Fire-and-monitor executors launch sub-adapters as detached processes (Docker containers, AWS Batch jobs, etc.) where no stdin/stdout pipe is possible. They need an alternative transport for the adapter input payload (normally stdin) and the adapter output result (normally stdout). + +The adapter CLI already supports `-i <uri>` and `-o <uri>` as alternatives to stdin/stdout, and `_read_input` already handles S3 URIs for input. The gap is: + +1. No standard S3 path convention for adapter I/O within the `fn-exec/` namespace. +2. No object that encapsulates derivation and access of those paths. +3. `_write_output` does not support S3 URIs — sub-adapters in remote environments cannot write their result back. +4. The `batch` executor has ad-hoc S3 I/O logic outside `fn-exec/`, using `S3Store.cd("jobs")` with random content-addressed keys, which is non-deterministic from the poller's perspective and unrelated to the execution record structure. +5. The `docker` executor works around the same problem with a local tmpdir volume mount, carrying `workdir` and `output_path` in its state — unnecessary machinery that ties the poller to the local filesystem. + +Among current executors, `docker` and `batch` are both migrated in this change. `script` pipes directly via supervisor, `ssh` forwards stdin/stdout over the SSH session, and `cfn` has no sub-adapter.
+ +`ExecutionState` already owns the `fn-exec/` namespace and has the raw S3 primitives needed. It is the correct home for this capability. + +## Goals / Non-Goals + +**Goals:** + +- Define a standard `fn-exec/io/{cache_key}/{exec_id}/{name}/` sub-namespace for adapter I/O. +- Provide `AdapterIO`, a scoped object derived from `ExecutionState`, with `input_uri`, `output_uri`, `write_input()`, and `read_output()`. +- Add S3 write support to `AdapterBase._write_output()` so sub-adapters can honor `-o <uri>`. +- Migrate `docker` executor to use `AdapterIO`, removing `workdir`, `output_path`, and tmpdir machinery from its state and cleanup. +- Migrate `batch` executor to use `AdapterIO`. + +**Non-Goals:** + +- Modifying `S3Store` (content-addressed artifact store — wrong abstraction here). +- Providing adapter I/O for executors that can pipe stdin/stdout directly (script, ssh). +- GC of `fn-exec/io/` objects (out of scope; same lifecycle as execution records, GC addressed separately). +- Any new executor beyond migrating `docker` and `batch`. + +## Decisions + +### `AdapterIO` lives in `exec_state.py` alongside `ExecutionState` + +`AdapterIO` is a scoped view into `ExecutionState`'s S3 primitives and namespace. It is not executor-specific and not part of the adapter contract itself — it is coordination infrastructure. Co-locating it with `ExecutionState` keeps the namespace ownership clear. + +Alternatives considered: +- `adapters.py`: rejected — that module is the sub-adapter side; it should not own S3 coordination paths. +- New `adapter_io.py` module: rejected — unnecessary split for a small, tightly coupled class. + +### Path: `fn-exec/io/{cache_key}/{exec_id}/{name}/` + +- `cache_key` groups all I/O for a given function execution, consistent with the rest of `fn-exec/`. +- `exec_id` is a UUID assigned per execution attempt, making each run's I/O unique even on retry. +- `name` is `"{adapter}:{executor}"` (e.g.
`"lambda:batch"`), scoping I/O to the specific adapter/executor pair within a run. This avoids collisions when multiple adapters are involved in the same execution chain and makes the namespace self-documenting. + +Alternatives considered: +- Using `exec_number` instead of `exec_id`: `exec_id` is already threaded through both `start()` and `poll()`, so it is the natural key. `exec_number` requires an extra lookup. +- Flat `fn-exec/io/{exec_id}/`: loses the cache_key grouping, harder to GC or inspect by job. +- Outside `fn-exec/` (e.g. `jobs/`): breaks namespace ownership; `fn-exec/` is the authoritative coordination prefix. + +### `output_uri` is derivable, not stored in executor state + +Because `AdapterIO` derives its paths deterministically from `(cache_key, exec_id, name)` — all available in both `start()` and `poll()` — the poller can reconstruct the same `AdapterIO` instance and call `read_output()` without needing the URI in the persisted state dict. This removes `input_uri` and `output_uri` from the batch executor's state payload. + +### `name` is caller-defined, conventionally `{adapter-shorthand}:{executor}` + +`AdapterIO` does not enforce a naming scheme — callers pass whatever `name` string is appropriate for their context. Built-in executors use `"{adapter-shorthand}:{executor-name}"` (e.g. `"local:docker"`, `"lambda:batch"`). This convention is sufficient to avoid collisions within the `fn-exec/io/` namespace and makes paths self-documenting. Future executors should follow the same convention but are not required to. + +### Docker image tar tmpdir is ephemeral and cleaned up immediately + +`DockerExecutor._prepare_image` downloads an S3 image tar to a temporary directory for `docker load`. With this change, there is no longer a persistent workdir for I/O — the image tar tmpdir is created, used for `docker load`, and removed immediately after. It is not part of executor state. 
+ +### `_write_output` S3 support uses a direct boto3 PUT + +`_read_input` uses `S3Store().get(uri)` because `S3Store.get` handles URI parsing and works without a prefix context. For output, `S3Store.put()` is content-addressed (key derived from SHA256 of data) and cannot write to a pre-determined URI. Rather than modifying `S3Store`, `_write_output` will parse the S3 URI directly and use a minimal boto3 `put_object` call — consistent with how `ExecutionState` handles its own S3 writes internally. + +Alternatives considered: +- Adding `put_at(uri, data)` to `S3Store`: rejected per explicit decision to not modify `S3Store` for this feature. +- Importing `ExecutionState` into `adapters.py` for the write: rejected — the sub-adapter side only needs a raw S3 PUT to a URI it was handed; it has no reason to know about `ExecutionState`. + +## Risks / Trade-offs + +- **No GC defined for `fn-exec/io/`** → These objects accumulate until a future GC pass is defined. Acceptable short-term; `fn-exec/` cleanup is a separate concern. +- **`batch` and `docker` executor state shapes change** → `batch` loses `input_uri` and `output_uri`; `docker` loses `workdir` and `output_path`. Any in-flight executions at deploy time with those keys in their recorded state will not break `poll()` (it no longer reads them from state), but old-format state will have unused keys. Clean cutover acceptable given no versioning contract on executor state internals. + +## Open Questions + +None. diff --git a/openspec/changes/archive/2026-04-25-adapter-s3-io/proposal.md b/openspec/changes/archive/2026-04-25-adapter-s3-io/proposal.md new file mode 100644 index 0000000..fc79022 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-adapter-s3-io/proposal.md @@ -0,0 +1,32 @@ +## Why + +Fire-and-monitor executors launch sub-adapters in environments where stdin/stdout piping is not possible — the sub-adapter runs as a detached process (Docker container, AWS Batch job, etc.) 
and the executor cannot hold open a pipe to it. Currently there is no standard way to pass the input payload to the sub-adapter or receive its output result. The `batch` executor works around this with ad-hoc S3 logic that is not generalized, not scoped to `fn-exec/`, and broken on the output side. The `docker` executor works around it with a local tmpdir volume mount, which is fragile and carries unnecessary state. Both are instances of the same problem. + +## What Changes + +- Add an `AdapterIO` class to `exec_state.py` that provides a scoped S3-backed stdin/stdout surrogate for a specific `(cache_key, exec_id, name)` triple. +- Add `ExecutionState.adapter_io(exec_id, name)` factory method returning an `AdapterIO` instance. +- Add S3 write support to `AdapterBase._write_output()` so sub-adapters running inside remote environments can write their result to an S3 URI passed via `-o`. +- Migrate the `docker` executor to use `AdapterIO`, replacing the local tmpdir volume mount approach and removing `workdir` and `output_path` from its state. +- Migrate the `batch` executor to use `AdapterIO` instead of its current ad-hoc S3 I/O logic, removing `input_uri` and `output_uri` from its state. +- Update `docs/contrib/executor-state.md` to document `AdapterIO`. + +Among current executors, `docker` and `batch` are the two that use `AdapterIO`. It is the intended standard pattern for any future fire-and-monitor executor (EMR, Glue, ECS, SageMaker, etc.). + +## Capabilities + +### New Capabilities + +- `adapter-s3-io`: S3-backed stdin/stdout surrogate (`AdapterIO`) for fire-and-monitor executors that cannot pipe data to/from a sub-adapter process directly. + +### Modified Capabilities + +- `execution-state`: `ExecutionState` gains a new `adapter_io()` factory method; the `fn-exec/io/` sub-namespace is added to the S3 layout it owns. + +## Impact + +- `src/daggerml/_internal/exec_state.py` — new `AdapterIO` class, new `ExecutionState.adapter_io()` method. 
+- `src/daggerml/contrib/adapters.py` — `_write_output()` gains S3 URI support. +- `src/daggerml/contrib/executors/docker.py` — migrated to `AdapterIO`; `workdir`, `output_path`, and tmpdir machinery removed from state and cleanup. +- `src/daggerml/contrib/executors/batch.py` — migrated to `AdapterIO`; `input_uri`, `output_uri`, and `S3Store.cd("jobs")` usage removed from state. +- `docs/contrib/executor-state.md` — updated to cover `AdapterIO` and `fn-exec/io/` namespace. diff --git a/openspec/changes/archive/2026-04-25-adapter-s3-io/specs/adapter-s3-io/spec.md b/openspec/changes/archive/2026-04-25-adapter-s3-io/specs/adapter-s3-io/spec.md new file mode 100644 index 0000000..8c5b6a5 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-adapter-s3-io/specs/adapter-s3-io/spec.md @@ -0,0 +1,58 @@ +## ADDED Requirements + +### Requirement: AdapterIO provides scoped S3 stdin/stdout surrogate +The system SHALL provide an `AdapterIO` class that acts as an S3-backed surrogate for stdin/stdout between a fire-and-monitor adapter and the sub-adapter it launches. `AdapterIO` SHALL be constructed only via `ExecutionState.adapter_io(exec_id, name)` and SHALL scope all S3 paths under `{fn-exec-prefix}/io/{cache_key}/{exec_id}/{name}/`. + +#### Scenario: Paths scoped correctly +- **WHEN** `state.adapter_io("exec-uuid", "lambda:batch")` is called on an `ExecutionState` with `cache_key="ck"` and `remote_root="s3://bucket/pfx"` +- **THEN** `input_uri` is `s3://bucket/pfx/fn-exec/io/ck/exec-uuid/lambda:batch/input.json` and `output_uri` is `s3://bucket/pfx/fn-exec/io/ck/exec-uuid/lambda:batch/output.json` + +### Requirement: input_uri and output_uri are pure derivations +`AdapterIO.input_uri` and `AdapterIO.output_uri` SHALL be properties that return S3 URIs without performing any S3 operation. 
+ +#### Scenario: No S3 call on property access +- **WHEN** `io.input_uri` or `io.output_uri` is accessed +- **THEN** no S3 API call is made + +### Requirement: write_input writes payload and returns input URI +`AdapterIO.write_input(data: bytes)` SHALL PUT `data` to the input S3 key and return `input_uri`. + +#### Scenario: write_input stores payload at input key +- **WHEN** `io.write_input(b'{"payload": 1}')` is called +- **THEN** the bytes are written to the input S3 key and `input_uri` is returned + +### Requirement: read_output returns output bytes or None +`AdapterIO.read_output()` SHALL GET the output S3 key and return the raw bytes. If the object does not yet exist, it SHALL return `None` without raising. + +#### Scenario: read_output returns None when not yet written +- **WHEN** `io.read_output()` is called before the sub-adapter has written output +- **THEN** `None` is returned + +#### Scenario: read_output returns bytes when written +- **WHEN** the sub-adapter has written its result to the output S3 key and `io.read_output()` is called +- **THEN** the raw bytes are returned + +### Requirement: name is caller-defined with a conventional format +The `name` parameter passed to `ExecutionState.adapter_io()` SHALL be chosen by the caller. Built-in executors SHALL use the convention `"{adapter-shorthand}:{executor-name}"` (e.g. `"local:docker"`, `"lambda:batch"`). `AdapterIO` SHALL NOT validate or interpret the `name` value. + +#### Scenario: name is incorporated into S3 path verbatim +- **WHEN** `state.adapter_io(exec_id, "local:docker")` is called +- **THEN** the resulting paths contain `local:docker` as a path component under `fn-exec/io/{cache_key}/{exec_id}/` + +### Requirement: AdapterIO is only for fire-and-monitor executors +`AdapterIO` SHALL only be used by executors that launch a sub-adapter as a detached process where direct stdin/stdout piping is not possible. Among current executors, `docker` and `batch` use `AdapterIO`. 
Executors that can pipe stdin/stdout directly (`script`, `ssh`) SHALL NOT use `AdapterIO`. `cfn` has no sub-adapter and SHALL NOT use `AdapterIO`. + +#### Scenario: docker executor uses AdapterIO +- **WHEN** `DockerExecutor.start()` is called +- **THEN** it uses `AdapterIO.write_input()` to write the sub-adapter payload and passes `io.input_uri` and `io.output_uri` to the container command; no local tmpdir is created for I/O + +#### Scenario: batch executor uses AdapterIO +- **WHEN** `BatchExecutor.start()` is called +- **THEN** it uses `AdapterIO.write_input()` to write the sub-adapter payload and passes `io.input_uri` and `io.output_uri` to the Batch container command + +### Requirement: Sub-adapter output written via _write_output S3 support +`AdapterBase._write_output` SHALL support S3 URIs as the output path, writing the result payload directly to the specified S3 key via `put_object`. + +#### Scenario: Sub-adapter writes to S3 output URI +- **WHEN** the adapter CLI is invoked with `-o s3://bucket/key` and execution completes +- **THEN** the result JSON is written to `s3://bucket/key` diff --git a/openspec/changes/archive/2026-04-25-adapter-s3-io/specs/execution-state/spec.md b/openspec/changes/archive/2026-04-25-adapter-s3-io/specs/execution-state/spec.md new file mode 100644 index 0000000..883ff16 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-adapter-s3-io/specs/execution-state/spec.md @@ -0,0 +1,15 @@ +## ADDED Requirements + +### Requirement: ExecutionState exposes adapter_io factory +`ExecutionState` SHALL provide an `adapter_io(exec_id: str, name: str) -> AdapterIO` method that returns a scoped `AdapterIO` instance for the given execution attempt and adapter/executor name. 
+ +#### Scenario: adapter_io returns AdapterIO with correct scope +- **WHEN** `ExecutionState(cache_key, remote_root=...).adapter_io(exec_id, name)` is called +- **THEN** the returned `AdapterIO` instance derives all paths from `(cache_key, exec_id, name)` under the `fn-exec/io/` sub-namespace + +### Requirement: fn-exec/io/ sub-namespace is owned by ExecutionState +The system SHALL use `{fn-exec-prefix}/io/{cache_key}/{exec_id}/{name}/` as the standard S3 path for adapter I/O objects. This sub-namespace SHALL be owned by `ExecutionState`. + +#### Scenario: Adapter I/O paths are within fn-exec/ +- **WHEN** `AdapterIO` writes or derives any S3 key +- **THEN** all keys are prefixed with `{fn-exec-prefix}/io/` diff --git a/openspec/changes/archive/2026-04-25-adapter-s3-io/tasks.md b/openspec/changes/archive/2026-04-25-adapter-s3-io/tasks.md new file mode 100644 index 0000000..48801fa --- /dev/null +++ b/openspec/changes/archive/2026-04-25-adapter-s3-io/tasks.md @@ -0,0 +1,29 @@ +## 1. AdapterIO and ExecutionState + +- [x] 1.1 Add `AdapterIO` class to `src/daggerml/_internal/exec_state.py` with `input_uri`, `output_uri` properties, `write_input(data: bytes) -> str`, and `read_output() -> bytes | None` +- [x] 1.2 Add `ExecutionState.adapter_io(exec_id: str, name: str) -> AdapterIO` factory method +- [x] 1.3 Add tests for `AdapterIO` path derivation, `write_input`, and `read_output` + +## 2. Adapter CLI S3 Output Support + +- [x] 2.1 Add S3 write support to `AdapterBase._write_output()` in `src/daggerml/contrib/adapters.py` (parallel to existing S3 read support in `_read_input`) +- [x] 2.2 Add tests for `_write_output` with an S3 URI + +## 3. 
Migrate docker Executor + +- [x] 3.1 Update `DockerExecutor.start()` in `src/daggerml/contrib/executors/docker.py` to use `AdapterIO.write_input()` for the sub-adapter payload and pass `io.input_uri` / `io.output_uri` to the container command instead of mounting a local tmpdir +- [x] 3.2 Update `DockerExecutor.poll()` to reconstruct `AdapterIO` and use `io.read_output()` instead of reading a local `output_path` from state +- [x] 3.3 Remove `workdir` and `output_path` from `DockerExecutor` state; make `_prepare_image` tmpdir ephemeral (created and removed within `start()`) +- [x] 3.4 Add or update tests for `DockerExecutor` covering S3-backed I/O and the simplified state shape + +## 4. Migrate batch Executor + +- [x] 4.1 Update `BatchExecutor.start()` in `src/daggerml/contrib/executors/batch.py` to use `AdapterIO.write_input()` for the sub-adapter payload and pass `io.input_uri` / `io.output_uri` to the Batch container command +- [x] 4.2 Update `BatchExecutor.poll()` to reconstruct `AdapterIO` from `(cache_key, execution_id, name)` and use `io.read_output()` instead of reading `output_uri` from state +- [x] 4.3 Remove `input_uri`, `output_uri`, and `S3Store.cd("jobs")` usage from the batch executor +- [x] 4.4 Add or update tests for `BatchExecutor` covering S3-backed I/O and the simplified state shape + +## 5. 
Documentation + +- [x] 5.1 Update `docs/contrib/executor-state.md` to document `AdapterIO`, the `fn-exec/io/` sub-namespace, and the `adapter_io()` factory +- [x] 5.2 Update `docs/contrib/executor-catalog.md` entries for `docker` and `batch` to reflect S3-backed I/O and simplified state diff --git a/openspec/changes/archive/2026-04-25-exec-state-s3-backend/.openspec.yaml b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/.openspec.yaml new file mode 100644 index 0000000..204fc5a --- /dev/null +++ b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-18 diff --git a/openspec/changes/archive/2026-04-25-exec-state-s3-backend/design.md b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/design.md new file mode 100644 index 0000000..8ee3674 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/design.md @@ -0,0 +1,69 @@ +## Context + +`ExecutionState` is the distributed mutex for function execution in daggerml. Multiple processes sharing the same `remote_root` may concurrently call `start_fn` for the same `cache_key` — only one should drive the adapter at a time. + +Previously backed by DynamoDB with a rich state machine (`pending → running → succeeded/failed → done`), the new design strips this down: the lock file is a pure mutex, adapter-private state lives wherever the adapter chooses, and terminal results are communicated via adapter stdout. S3 is already required for all remote operations via `DML_REMOTE_ROOT`. + +S3 conditional writes (`If-None-Match: *` for create, DELETE for release) provide the mutex. S3 has been strongly consistent since December 2020. + +## Goals / Non-Goals + +**Goals:** +- Implement `ExecutionState` in `daggerml._internal.exec_state` as a pure S3 mutex. +- Lock file stored at `{remote_root_prefix}/exec/{cache_key}.json` containing only `{lock_token, lock_expires_ts}`. 
+- Lock lifecycle: **create** (`If-None-Match: *`) → **delete**. No updates ever. +- TTL on creation is a safety net for crashed processes only. +- Rewrite `start_fn` in `_internal/ops/index.py` to: check cache → lock → recheck cache → call adapter → unlock → return. +- Adapter stdout carries terminal result `{status, dag_id?, error?}`. +- Remove `DML_DYNAMODB_TABLE` from the execution path entirely. + +**Non-Goals:** +- Migrating `dml-util/aws/dynamodb.py`. +- Preserving the old `ExecutionState` API (`upsert`, `heartbeat`, `mark_*`, `claim_running`, etc.). +- Supervisor-based heartbeating (adapters must return quickly; long-running jobs manage their own external state). + +## Decisions + +### 1. Lock file = pure mutex, not state record + +The old design encoded job status, `dag_id`, `error`, `heartbeat_ts`, and metadata into the lock record. All of that is removed. Adapters that need persistent state (e.g. a batch job ID to poll) store it themselves in S3 under their own keys. Terminal result flows back via stdout, not the lock file. + +**Why:** Decouples the mutex from adapter-specific concerns. Each adapter is free to store whatever it needs without a shared schema. + +### 2. Lock lifecycle is create/delete only + +`lock()` = `PUT If-None-Match: *`. `unlock()` = `DELETE`. No ETag-based update, no heartbeat writes. The `lock_expires_ts` written at creation is the only TTL mechanism — if a process crashes, the next caller that sees an expired file can delete it and re-lock. + +**Alternatives considered:** +- ETag-based update for heartbeat — rejected; adapters return quickly, no heartbeat needed. +- Separate lock + state files — rejected; unnecessary complexity given the stripped model. + +### 3. `start_fn` rewritten around the mutex + +``` +1. check cache → hit? return node +2. lock() → FAILED? return None (another process is driving this cycle) +3. check cache again (post-lock) + → HIT? delete lock file, return node +4. 
call_adapter() → stdout: {status, dag_id?, error?} +5. if succeeded: publish to cache, delete lock + if failed: delete lock, raise + if running: delete lock, return None (adapter still working) +``` + +Steps 2–5 replace the old `upsert` + `claim_running` + `_mark_execution_done` dance. + +### 4. `contrib/executor_state.py` removed, not deprecated + +Since the API changes entirely (no `mark_*`, no `upsert`, no `heartbeat`), re-exporting a compatible shim is not meaningful. Existing contrib executors that used the old API must be rewritten to manage their own state. The module is deleted. + +## Risks / Trade-offs + +- **S3 conditional write requires boto3 ≥ 1.35.36** — `If-None-Match: *` silently dropped on older SDKs. Mitigation: document minimum version; moto always supports it in tests. +- **No TTL auto-expiry** — stale lock files persist until a caller checks and finds them expired. Mitigation: acceptable; next `lock()` call steals it. +- **Adapter must return quickly** — the lock is held for the duration of `call_adapter()`. A slow adapter blocks all other callers for that `cache_key`. Mitigation: this is a contract on adapter authors, documented explicitly. +- **Contrib executors need rewriting** — `batch.py`, `docker.py`, `script.py`, `cfn.py`, `_lambda.py`, `ssh.py`, `supervisor.py` all used the old `ExecutionState` API and must manage their own state going forward. + +## Open Questions + +- Should `lock()` steal an expired lock file via DELETE + re-PUT, or should it just DELETE and return `False` (forcing the caller to retry on the next `start_fn` invocation)? Recommendation: DELETE + re-PUT in one `lock()` call so the caller doesn't lose a cycle. 
diff --git a/openspec/changes/archive/2026-04-25-exec-state-s3-backend/proposal.md b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/proposal.md new file mode 100644 index 0000000..da1c736 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/proposal.md @@ -0,0 +1,31 @@ +## Why + +Execution state currently requires a DynamoDB table, adding operational overhead to every deployment. By moving execution state to S3 — which is already required for all other remote operations — the entire infrastructure footprint becomes a single S3 prefix, simplifying setup and enabling atomic cache semantics via S3 conditional writes. + +## What Changes + +- **New module** `daggerml._internal.exec_state`: S3-backed `ExecutionState` with advisory locking via S3 conditional writes (`If-None-Match`/`If-Match` on ETags), replacing the DynamoDB implementation. +- **State objects** are stored at `{remote_root_prefix}/exec/{cache_key}.json`, sibling to `refs/`, since they reference internal DAG refs in their payload. +- **`daggerml._internal.ops.index`** updated to import `ExecutionState` from `_internal.exec_state` instead of `daggerml.contrib.executor_state`. +- **`daggerml.contrib.executor_state`** deprecated; callers in `contrib` updated to use the new internal module. +- **Removed dependency** on `DML_DYNAMODB_TABLE` environment variable and DynamoDB boto3 client in the execution path. +- **Test infrastructure** updated: moto DynamoDB table fixture replaced with moto S3 fixture (already present for other tests). + +## Capabilities + +### New Capabilities + +- `execution-state`: S3-backed execution state record with advisory locking, heartbeat, and status transitions (`pending → running → succeeded/failed → done`), accessed via `remote_root` string rather than a DynamoDB table name. 
+ +### Modified Capabilities + + + +## Impact + +- **`src/daggerml/_internal/ops/index.py`**: change import source for `ExecutionState`; pass `remote_root` instead of table name. +- **`src/daggerml/contrib/executor_state.py`**: deprecated (kept for backwards compatibility or removed). +- **`src/daggerml/contrib/executors/batch.py`**, **`docker.py`**: no longer need to forward `DML_DYNAMODB_TABLE` to containers. +- **Tests**: `tests/conftest.py`, `tests/contrib/test_executor_state.py`, `tests/contrib/test_executor_base.py` — swap DynamoDB moto fixtures for S3. +- **Docs**: `docs/contrib/executor-state.md`, `executor-catalog.md`, `execution-graph.md` — update infra requirements. +- **No public API change** for callers of `IndexOps`; the `ExecutionState` type is internal. diff --git a/openspec/changes/archive/2026-04-25-exec-state-s3-backend/specs/execution-state/spec.md b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/specs/execution-state/spec.md new file mode 100644 index 0000000..8e091cc --- /dev/null +++ b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/specs/execution-state/spec.md @@ -0,0 +1,87 @@ +## ADDED Requirements + +### Requirement: S3-backed mutex lock file +The system SHALL store a lock file at `{remote_root_prefix}/exec/{cache_key}.json` containing only `{lock_token: str, lock_expires_ts: float}`. No status, metadata, or job-specific fields. + +#### Scenario: Lock file written to correct S3 key +- **WHEN** `ExecutionState(cache_key, remote_root="s3://bucket/prefix").lock()` succeeds +- **THEN** a JSON object is written to `s3://bucket/prefix/exec/{cache_key}.json` + +#### Scenario: No DynamoDB dependency +- **WHEN** any `ExecutionState` method is called +- **THEN** no DynamoDB client is created and `DML_DYNAMODB_TABLE` is not read + +### Requirement: Lock acquired via create-if-absent +The system SHALL acquire the lock by PUT with `If-None-Match: *`. 
If the object already exists and its `lock_expires_ts` has not passed, `lock()` SHALL return `False`. If the existing lock is expired, the system SHALL DELETE it and re-PUT, returning `True`. + +#### Scenario: Lock acquired when no file exists +- **WHEN** `lock()` is called and no lock file exists at the key +- **THEN** the file is created with a fresh `lock_token` and `lock_expires_ts`, and `True` is returned + +#### Scenario: Lock refused when held and not expired +- **WHEN** `lock()` is called and a non-expired lock file exists +- **THEN** `False` is returned and the file is unchanged + +#### Scenario: Expired lock is stolen +- **WHEN** `lock()` is called and an expired lock file exists +- **THEN** the old file is deleted, a new one is created, and `True` is returned + +#### Scenario: Concurrent create conflict returns False +- **WHEN** the `If-None-Match: *` PUT returns `412 PreconditionFailed` +- **THEN** `False` is returned without raising + +### Requirement: Lock released via DELETE +The system SHALL release the lock by DELETE of the lock file. No updates to the file are ever made. + +#### Scenario: Unlock deletes the file +- **WHEN** `unlock()` is called by the lock holder +- **THEN** the lock file is deleted from S3 + +#### Scenario: Unlock is idempotent +- **WHEN** `unlock()` is called and the file does not exist +- **THEN** no error is raised + +### Requirement: start_fn mutex-gated adapter dispatch +`IndexOps.start_fn` SHALL implement the following flow on every call: +1. Check cache — return node if hit. +2. Attempt `lock()` — return `None` if failed. +3. Recheck cache — if hit, delete lock file and return node. +4. Call adapter (must return quickly); adapter stdout carries `{status, dag_id?, error?}`. +5. On `succeeded`: publish result to cache, delete lock file. +6. On `failed`: delete lock file, raise. +7. On `running`: delete lock file, return `None`. 
+ +#### Scenario: Cache hit before lock returns node immediately +- **WHEN** `start_fn` is called and the cache already contains a result +- **THEN** the node is returned without acquiring the lock + +#### Scenario: Lock contention returns None +- **WHEN** `start_fn` is called and another process holds the lock +- **THEN** `None` is returned so the caller retries + +#### Scenario: Cache hit after lock cleans up and returns node +- **WHEN** `start_fn` acquires the lock but finds a cache hit on recheck +- **THEN** the lock file is deleted and the cached node is returned + +#### Scenario: Adapter success publishes cache and releases lock +- **WHEN** the adapter returns `status: succeeded` with a `dag_id` +- **THEN** the result is published to cache and the lock file is deleted + +#### Scenario: Adapter failure releases lock and raises +- **WHEN** the adapter returns `status: failed` +- **THEN** the lock file is deleted and a `DmlRepoError` is raised + +#### Scenario: Adapter still running releases lock and returns None +- **WHEN** the adapter returns `status: running` +- **THEN** the lock file is deleted and `None` is returned + +### Requirement: ExecutionState constructed from remote_root +The system SHALL accept `remote_root: str` as its sole configuration parameter. 
+ +#### Scenario: remote_root parsed to bucket and prefix +- **WHEN** `ExecutionState(cache_key, remote_root="s3://my-bucket/my/prefix")` is constructed +- **THEN** lock operations target `s3://my-bucket/my/prefix/exec/{cache_key}.json` + +#### Scenario: Missing remote_root raises error +- **WHEN** `ExecutionState` is constructed without a valid `remote_root` +- **THEN** a `DmlRepoError` is raised with a descriptive message diff --git a/openspec/changes/archive/2026-04-25-exec-state-s3-backend/tasks.md b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/tasks.md new file mode 100644 index 0000000..d0844a2 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-exec-state-s3-backend/tasks.md @@ -0,0 +1,59 @@ +## 1. New Module: `_internal/exec_state.py` + +- [x] 1.1 Create `src/daggerml/_internal/exec_state.py` with `LockRecord` TypedDict `{lock_token: str, lock_expires_ts: float}` and `LOCK_TTL` constant +- [x] 1.2 Implement `ExecutionState.__init__(cache_key, *, remote_root)` — parse `s3://bucket/prefix`, derive key `{prefix}/exec/{cache_key}.json`; raise `DmlRepoError` if invalid +- [x] 1.3 Implement `ExecutionState.lock(ttl)` — GET existing file; if absent: PUT with `If-None-Match: *`; if expired: DELETE then PUT; if held: return `False`; return `True` on success +- [x] 1.4 Implement `ExecutionState.unlock()` — DELETE the lock file; no-op if already absent +- [x] 1.5 Handle `412 PreconditionFailed` from S3 as `False` return (not an exception) in `lock()` + +## 2. 
Tests for `exec_state.py` + +- [x] 2.1 Create `tests/test_exec_state.py` with moto S3 fixture (reuse pattern from `tests/conftest.py`) +- [x] 2.2 Test `lock()` creates file when absent, returns `True` +- [x] 2.3 Test `lock()` returns `False` when non-expired lock exists +- [x] 2.4 Test `lock()` steals expired lock (DELETE + re-PUT), returns `True` +- [x] 2.5 Test `lock()` returns `False` on `412` concurrent conflict +- [x] 2.6 Test `unlock()` deletes the file +- [x] 2.7 Test `unlock()` is idempotent when file absent +- [x] 2.8 Test missing/invalid `remote_root` raises `DmlRepoError` + +## 3. Rewrite `start_fn` in `_internal/ops/index.py` + +- [x] 3.1 Remove lazy import of `contrib.executor_state.ExecutionState`; import from `daggerml._internal.exec_state` +- [x] 3.2 Replace `upsert` + `_call_adapter` + `get` + `_publish_terminal_state` + `_mark_execution_done` with the new mutex-gated flow: + - check cache → lock → recheck cache → call_adapter → handle result +- [x] 3.3 Update `_call_adapter` to parse stdout `{status, dag_id?, error?}` and return it +- [x] 3.4 On `succeeded`: call `CacheOps.put`, then `unlock()` +- [x] 3.5 On `failed`: call `unlock()`, raise `DmlRepoError` +- [x] 3.6 On `running`: call `unlock()`, return `None` +- [x] 3.7 Post-lock cache hit: DELETE lock file, return node + +## 4. 
Rewrite contrib executors to manage their own state + +- [x] 4.1 `contrib/executors/batch.py` — remove `ExecutionState` usage; store job state (job ID, status) in adapter-owned S3 key; return `{status, dag_id?, error?}` via stdout +- [x] 4.2 `contrib/executors/docker.py` — same as 4.1 +- [x] 4.3 `contrib/executors/script.py` — same as 4.1 +- [x] 4.4 `contrib/executors/cfn.py` — same as 4.1 +- [x] 4.5 `contrib/executors/_lambda.py` — same as 4.1 +- [x] 4.6 `contrib/executors/ssh.py` — same as 4.1 +- [x] 4.7 `contrib/supervisor.py` — remove `ExecutionState` usage; supervisor result written to a local file read by adapter on next call +- [x] 4.8 Remove `DML_DYNAMODB_TABLE` forwarding from `batch.py` and `docker.py` + +## 5. Delete `contrib/executor_state.py` + +- [x] 5.1 Delete `src/daggerml/contrib/executor_state.py` +- [x] 5.2 Remove `ExecutionRecord`, `ExecutionState` imports from all contrib files + +## 6. Update tests and fixtures + +- [x] 6.1 Remove DynamoDB moto fixture from `tests/conftest.py` (`test-dml-state` table, `DML_DYNAMODB_TABLE` env var) +- [x] 6.2 Delete `tests/contrib/test_executor_state.py` (superseded by `tests/test_exec_state.py`) +- [x] 6.3 Remove DynamoDB fixture dependency from `tests/contrib/test_executor_base.py` +- [x] 6.4 Remove `DML_DYNAMODB_TABLE` propagation from `tests/contrib/test_ssh_integration.py` +- [x] 6.5 Update `tests/contrib/test_docker_executor.py`: remove `DML_DYNAMODB_TABLE` assertion + +## 7. 
Update documentation + +- [x] 7.1 Update `docs/contrib/executor-state.md`: describe mutex-only model, remove DynamoDB table requirement +- [x] 7.2 Update `docs/contrib/executor-catalog.md`: remove `DML_DYNAMODB_TABLE` from required env vars; document adapter stdout contract +- [x] 7.3 Update `docs/contrib/execution-graph.md`: reflect S3-only backend and simplified lock lifecycle diff --git a/openspec/changes/archive/2026-04-25-is-node-like-predicate/.openspec.yaml b/openspec/changes/archive/2026-04-25-is-node-like-predicate/.openspec.yaml new file mode 100644 index 0000000..1b75776 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-is-node-like-predicate/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-25 diff --git a/openspec/changes/archive/2026-04-25-is-node-like-predicate/design.md b/openspec/changes/archive/2026-04-25-is-node-like-predicate/design.md new file mode 100644 index 0000000..8b1f3ec --- /dev/null +++ b/openspec/changes/archive/2026-04-25-is-node-like-predicate/design.md @@ -0,0 +1,54 @@ +## Context + +The contrib executor subsystem (`SshExecutor`, `DockerExecutor`, etc.) validates kwargs before building execution commands. Some kwargs may be provided as "live" values (`Node` — a resolved DAG node) or as "deferred" values (`DelayedRef`, `DelayedLoad`, `DelayedRunnable` — values that are only resolved at DAG execution time). Currently `SshExecutor._validate_kw` checks for deferred values using `isinstance(x, DelayedActionCodec)`, which is the codec wrapper, not the actual user-facing `Delayed*` types. There is no shared predicate, so each executor must replicate the pattern, risking drift. + +All `Delayed*` types and `Node` (public) are defined in `src/daggerml/contrib/api.py` and `src/daggerml/api.py` respectively. 
+ +## Goals / Non-Goals + +**Goals:** +- Add a single `is_node_like(x)` predicate in `src/daggerml/contrib/api.py` that returns `True` for `Node | DelayedRef | DelayedLoad | DelayedRunnable` +- Update `SshExecutor._validate_kw` to use `is_node_like` for its per-field checks +- Export `is_node_like` so other modules can import it + +**Non-Goals:** +- Refactoring all other executor validators (DockerExecutor, ScriptExecutor, BatchExecutor) in this change +- Changing the behavior of `DelayedActionCodec` or any codec logic +- Adding `is_node_like` to the internal `_internal/types.py` layer + +## Decisions + +**Where to define `is_node_like`** + +Place in `src/daggerml/contrib/api.py`, alongside the `Delayed*` type definitions. + +Alternatives considered: +- `src/daggerml/api.py` — only knows about `Node`, not `Delayed*`; would require importing contrib types into core API (wrong direction) +- `src/daggerml/_internal/types.py` — lowest-level home, but `Delayed*` types are in contrib and should not leak into internal +- Separate utils module — unnecessary indirection for a one-liner predicate + +**Predicate signature** + +```python +def is_node_like(x: object) -> bool: + return isinstance(x, (Node, DelayedRef, DelayedLoad, DelayedRunnable)) +``` + +Simple, no ABC or protocol needed at this stage. + +**SshExecutor import** + +`SshExecutor` already imports from `daggerml.contrib.api`; add `is_node_like` to that import. + +## Risks / Trade-offs + +- [Risk] Future `Delayed*` types added without updating `is_node_like` → Mitigation: keep the predicate next to the type definitions so it is easy to spot during code review +- [Trade-off] Not updating other executors now keeps the change small and reviewable; they remain inconsistent for now + +## Migration Plan + +No data migration needed. Change is purely additive (new function) plus a one-line update in `_validate_kw`. No deprecation or rollback concern. 
+ +## Open Questions + +- Should `is_node_like` also cover `MockNode` from `contrib/testing.py`? Current answer: no — `MockNode` is test infrastructure, not a production node-like type. Can be revisited. diff --git a/openspec/changes/archive/2026-04-25-is-node-like-predicate/proposal.md b/openspec/changes/archive/2026-04-25-is-node-like-predicate/proposal.md new file mode 100644 index 0000000..57b6a58 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-is-node-like-predicate/proposal.md @@ -0,0 +1,21 @@ +## Why + +Executor validation logic (e.g., `SshExecutor._validate_kw`) currently checks against `DelayedActionCodec` directly, which is an internal codec type rather than the user-facing `Delayed*` types (`DelayedRef`, `DelayedLoad`, `DelayedRunnable`). There is no shared predicate for "is this value a node or a delayed node-like value", so each executor duplicates the pattern inline and may get it wrong or inconsistently. + +## What Changes + +- Introduce a shared `is_node_like(x)` predicate function that returns `True` if `x` is an instance of `Node` or any of the `Delayed*` types (`DelayedRef`, `DelayedLoad`, `DelayedRunnable`). +- Update `SshExecutor._validate_kw` (and any other executor validation that checks for `Node`-or-deferred values) to use `is_node_like` instead of inline `isinstance` checks. + +## Capabilities + +### New Capabilities +- `is-node-like-predicate`: A shared predicate `is_node_like(x)` in the contrib API module that identifies values acceptable as node-like (i.e., `Node` or any `Delayed*` type), for use in executor validation and elsewhere. 
+ +### Modified Capabilities + +## Impact + +- `src/daggerml/contrib/api.py` — add `is_node_like` function +- `src/daggerml/contrib/executors/ssh.py` — update `_validate_kw` to use `is_node_like` +- Other executors (`docker.py`, `script.py`, `batch.py`) may also benefit but are out of scope for this change diff --git a/openspec/changes/archive/2026-04-25-is-node-like-predicate/specs/is-node-like-predicate/spec.md b/openspec/changes/archive/2026-04-25-is-node-like-predicate/specs/is-node-like-predicate/spec.md new file mode 100644 index 0000000..2c7a687 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-is-node-like-predicate/specs/is-node-like-predicate/spec.md @@ -0,0 +1,43 @@ +## ADDED Requirements + +### Requirement: is_node_like predicate exists in contrib API +The system SHALL provide a public function `is_node_like(x)` in `daggerml.contrib.api` that returns `True` if and only if `x` is an instance of `Node`, `DelayedRef`, `DelayedLoad`, or `DelayedRunnable`. + +#### Scenario: Node instance is node-like +- **WHEN** `is_node_like(x)` is called with a `Node` instance +- **THEN** it returns `True` + +#### Scenario: DelayedRef is node-like +- **WHEN** `is_node_like(x)` is called with a `DelayedRef` instance +- **THEN** it returns `True` + +#### Scenario: DelayedLoad is node-like +- **WHEN** `is_node_like(x)` is called with a `DelayedLoad` instance +- **THEN** it returns `True` + +#### Scenario: DelayedRunnable is node-like +- **WHEN** `is_node_like(x)` is called with a `DelayedRunnable` instance +- **THEN** it returns `True` + +#### Scenario: Plain value is not node-like +- **WHEN** `is_node_like(x)` is called with a plain Python value (str, int, list, None, etc.) 
+- **THEN** it returns `False` + +#### Scenario: DelayedActionCodec is not node-like +- **WHEN** `is_node_like(x)` is called with a `DelayedActionCodec` instance (the internal codec wrapper) +- **THEN** it returns `False` + +### Requirement: SshExecutor uses is_node_like for field validation +`SshExecutor._validate_kw` SHALL use `is_node_like` to accept node-like values for the `host` and `flags` fields instead of checking `isinstance(x, DelayedActionCodec)` directly. + +#### Scenario: Node-like host passes validation +- **WHEN** `_validate_kw` is called with `host` set to a `Node`, `DelayedRef`, `DelayedLoad`, or `DelayedRunnable` +- **THEN** validation passes without error + +#### Scenario: Node-like flags passes validation +- **WHEN** `_validate_kw` is called with `flags` set to a `Node`, `DelayedRef`, `DelayedLoad`, or `DelayedRunnable` +- **THEN** validation passes without error + +#### Scenario: Invalid host still raises error +- **WHEN** `_validate_kw` is called with `host` set to an empty string or a non-node-like non-string +- **THEN** a `DmlRepoError` is raised diff --git a/openspec/changes/archive/2026-04-25-is-node-like-predicate/tasks.md b/openspec/changes/archive/2026-04-25-is-node-like-predicate/tasks.md new file mode 100644 index 0000000..665970b --- /dev/null +++ b/openspec/changes/archive/2026-04-25-is-node-like-predicate/tasks.md @@ -0,0 +1,15 @@ +## 1. Add is_node_like predicate + +- [x] 1.1 In `src/daggerml/contrib/api.py`, add `is_node_like(x: object) -> bool` after the `Delayed*` class definitions, returning `isinstance(x, (Node, DelayedRef, DelayedLoad, DelayedRunnable))` +- [x] 1.2 Export `is_node_like` in any relevant `__all__` or public re-export in `contrib/api.py` + +## 2. 
Update SshExecutor validation + +- [x] 2.1 In `src/daggerml/contrib/executors/ssh.py`, add `is_node_like` to the import from `daggerml.contrib.api` +- [x] 2.2 In `SshExecutor._validate_kw`, replace `isinstance(host, DelayedActionCodec)` with `is_node_like(host)` +- [x] 2.3 In `SshExecutor._validate_kw`, replace `isinstance(flags, DelayedActionCodec)` with `is_node_like(flags)` + +## 3. Verify + +- [x] 3.1 Run existing tests to confirm no regressions (`pytest` or equivalent) +- [x] 3.2 Confirm `is_node_like` returns `False` for `DelayedActionCodec` instances (manual check or test) diff --git a/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/.openspec.yaml b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/.openspec.yaml new file mode 100644 index 0000000..8b394c6 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-23 diff --git a/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/design.md b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/design.md new file mode 100644 index 0000000..d122664 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/design.md @@ -0,0 +1,161 @@ +## Context + +The current execution flow uses `cache_key` as both the cache identity and the practical lookup key for multiple runtime concerns: lock ownership, executor-private resumable state, and some nested transport handoff behavior. That spreads state across runtime-owned and executor-owned prefixes and makes stale-lock recovery, nested execution behavior, and observability difficult to reason about. + +This change introduces a clearer split between identities and responsibilities. `cache_key` remains the stable identity for the computation and cache entry. 
`execution_id` becomes the stable identity for one in-flight execution attempt. The runtime owns lock state, active execution pointers, immutable execution records, and call-edge lineage indexes. Adapters receive enough information to launch or poll, but no longer own the durable execution-state layout. + +This is a cross-cutting change because it affects runtime execution flow, adapter envelopes, executor resumability assumptions, failure publication, and S3 data layout. + +## Goals / Non-Goals + +**Goals:** +- Separate computation identity (`cache_key`) from in-flight execution identity (`execution_id`). +- Make the runtime authoritative for durable execution ownership and stale-lock recovery. +- Create immutable execution records that capture the launch-time durable state required for all later polls. +- Remove `pending` and simplify adapter status handling to `running|succeeded|failed`. +- Publish failed executions into cache after completing the DAG with the error. +- Persist caller/callee lineage for both user-dag to fn-dag calls and fn-dag to fn-dag calls in both query directions. + +**Non-Goals:** +- Tracking multiplicity or per-attempt history for repeated caller/callee edges. +- Defining a global append-only call log. +- Preserving executor-owned resumable-state prefixes for backwards compatibility. +- Introducing mutable execution records or heartbeat-updated execution metadata. + +## Decisions + +### 1. Runtime-owned active pointer and immutable execution record + +The runtime will store: + +- `active/{cache_key}`: plain-text `execution_id` +- `exec/{execution_id}.json`: immutable execution record created only on the first non-terminal adapter result for that execution + +The execution record stores the launch-time durable state returned by the adapter and is never updated. Later polls reuse that stored state. + +Why: +- Separates the stable cache identity from the in-flight execution identity. +- Avoids executor-private durable-state layouts. 
+- Makes stale-lock recovery resume the existing execution rather than invent a new one. + +Alternatives considered: +- Reusing `cache_key` as the only durable identity: rejected because it conflates cache lookup, locking, and execution resumption. +- Mutable execution records: rejected because the design goal is a one-time persisted launch snapshot with simpler reasoning. + +### 2. Lock recovery preserves execution identity + +`start_fn` still locks by `cache_key`, but stale-lock recovery must preserve the current `execution_id` whenever `active/{cache_key}` points to an existing execution record. + +Why: +- Lock ownership is transient coordination. +- Execution identity is the durable handle for the in-flight attempt. +- A stale lock should never silently fork a duplicate execution if resumable state exists. + +Alternatives considered: +- Creating a new execution on stale-lock recovery: rejected because it risks duplicate launches for the same computation. + +### 3. Adapter contract includes `execution_id` and initial `state` + +The adapter envelope includes `execution_id` and `state`. + +- First call for a new execution passes `state = null`. +- Resume calls pass the immutable stored state from `exec/{execution_id}.json`. + +Adapters may return `state` on later calls, but the runtime ignores replacement state after the execution record is created. + +Why: +- Adapters may need a stable execution-scoped identifier for external naming or storage. +- The first launch call is the right moment to return all durable resume handles. +- Ignoring replacement state keeps the runtime model simple and forces launch-time completeness. + +Alternatives considered: +- Omitting `execution_id` from the adapter envelope: rejected because adapters may need an execution-scoped namespace distinct from `cache_key`. +- Persisting updated adapter state on every poll: rejected because it reintroduces mutable execution-state complexity. + +### 4. 
Adapter statuses reduce to `running|succeeded|failed` + +`pending` is removed. New executions either: + +- return `running` with durable launch state, +- return `succeeded` with `dag_id`, or +- return `failed` with `error`. + +Why: +- There is no longer a separate runtime notion of pre-launch in-flight state. +- The first call is expected to launch the work or complete synchronously. + +Alternatives considered: +- Keeping `pending`: rejected because it does not carry unique semantics in the new model. + +### 5. Failed execution is cached after DAG error completion + +On adapter `failed`, the runtime completes the DAG with the error and publishes that failed outcome to cache, mirroring success publication. + +Why: +- Failure is still a terminal result for a specific computation. +- Caching terminal failures avoids repeated duplicate launches for deterministic failures. + +Alternatives considered: +- Leaving failures uncached: rejected because it preserves duplicate work and ambiguous terminal state. + +### 6. Call-edge lineage uses per-caller and per-callee list objects + +The runtime stores: + +- `calls/from/index/{index_id}.json` -> sorted, deduped list of callee cache keys +- `calls/from/cache/{cache_key}.json` -> sorted, deduped list of callee cache keys +- `calls/to/cache/{cache_key}.json` -> object with sorted, deduped `indexes` and `cache_keys` lists + +Definitions: +- user-dag: a DAG without an `argv` node and therefore without a cache key +- fn-dag: a DAG with an `argv` node and therefore with a cache key + +Edges are written only on the new-execution path, immediately after the lock is acquired and inactive state is confirmed. + +Why: +- User-dags can be callers but never callees, so their forward index can stay separate and simple. +- Reverse lookup for a callee must support mixed caller types. +- Writing once on new execution avoids duplicate lineage writes during resumes. 
+ +Alternatives considered: +- Per-edge objects: rejected for now in favor of fewer S3 objects and simpler query layout. +- Writing lineage only on success: rejected because lineage should represent attempted invocation, not only successful completion. + +### 7. Edge files update via read/merge/retry with ETag checks + +Each edge update performs: + +1. read current object, +2. merge new member, +3. dedupe and sort, +4. conditional write with ETag, +5. retry the full sequence on conflict. + +Why: +- This preserves correctness under concurrent writers without introducing a separate index service. + +Alternatives considered: +- Blind overwrite: rejected because it loses concurrent updates. + +## Risks / Trade-offs + +- Immutable execution records require adapters to return all durable resume handles on the first launch call -> Mitigation: make that a hard adapter contract and validate it in executor tests. +- Terminal failure caching may preserve transient infrastructure failures longer than desired -> Mitigation: scope cache identity and retry policy carefully, and document that only deterministic adapter failures should be surfaced as terminal results. +- List-based edge indexes can see write contention on high fan-in/fan-out cache keys -> Mitigation: use ETag-based full retries and accept that per-edge objects remain a future escape hatch. +- Stale `active/` pointers could block reuse if execution records are deleted or corrupted -> Mitigation: treat active pointers with missing execution records as stale and delete them before proceeding. +- Execution records and call-edge indexes add new long-lived S3 objects -> Mitigation: keep payloads minimal and rely on deterministic naming for inspection and cleanup tooling. + +## Migration Plan + +1. Introduce the new adapter envelope and result validation rules. +2. Implement runtime support for lock recovery, active pointers, immutable execution records, and failed-result cache publication. +3. 
Update contrib adapters and executors to return launch-time durable state on first `running` result and to ignore persisted mutable executor-owned state. +4. Add call-edge persistence on the new-execution path. +5. Remove obsolete executor-owned resumable-state usage and documentation. +6. Backfill or discard older in-flight state as a one-time compatibility break; no migration of legacy execution records is required. + +## Open Questions + +- Whether execution records should be retained after terminal success/failure or garbage-collected once the cache entry and active pointer are resolved. +- Whether runtime responses for a live lock should remain `None`-like or become a richer descriptive non-terminal result surfaced to callers. +- Whether failed-result cache publication should distinguish deterministic execution failures from infrastructure/transient failures in the cached payload. diff --git a/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/proposal.md b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/proposal.md new file mode 100644 index 0000000..52a514c --- /dev/null +++ b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/proposal.md @@ -0,0 +1,34 @@ +## Why + +Execution state is currently spread across lock files, executor-specific state prefixes, executor-private storage layouts, and local scratch paths, which makes execution ownership and stale-state recovery difficult to reason about. Nested execution also needs durable caller/callee lineage so the runtime can answer which user-dags and fn-dags invoked a given cache-keyed function. + +## What Changes + +- Replace the current mutable, executor-owned execution-state model with a runtime-owned model built around: + - a lock per `cache_key`, + - a plain-text active pointer from `cache_key` to `execution_id`, + - an immutable execution record created on the first adapter call for an execution. 
+- Extend the adapter envelope to include `execution_id` and the initial persisted execution `state`. +- Simplify adapter result states by removing `pending`; adapters return `running`, `succeeded`, or `failed`. +- Require that adapters return all durable resume state on the first launch call for an execution; later polls reuse the immutable stored state and ignore newly returned state. +- Treat failed executions as terminal cached results by completing the DAG with the error and publishing that failed outcome to cache. +- Add S3-backed call-edge indexes for both directions: + - user-dag (`index_id`) to callee `cache_key`, + - fn-dag caller `cache_key` to callee `cache_key`, + - reverse lookup from callee `cache_key` to both index and fn-dag callers. +- Update stale-lock handling so recovery keeps the active `execution_id` when resumable execution state still exists. + +## Capabilities + +### New Capabilities +- `runtime-execution-records`: Runtime-owned execution identity, immutable execution records, active execution pointers, and adapter envelope/result semantics for resumable execution. +- `execution-call-edges`: Queryable caller/callee lineage indexes between user-dags, fn-dags, and callee cache keys. + +### Modified Capabilities + +## Impact + +- Affected code will include execution flow in `src/daggerml/_internal/ops/index.py`, execution-state helpers, adapter payload validation, and contrib executor/adapters that currently own resumable state. +- S3 layout will gain runtime-owned `active/`, `exec/`, and `calls/` objects, alongside updated lock handling. +- Adapter/executor contracts and docs will change to reflect immutable execution records, `execution_id` propagation, removal of `pending`, and failed-result cache publication. +- Tests will need to cover stale-lock recovery, immutable execution records, active pointer lifecycle, adapter payload/result validation, and bidirectional call-edge updates. 
diff --git a/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/specs/execution-call-edges/spec.md b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/specs/execution-call-edges/spec.md new file mode 100644 index 0000000..e1e7282 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/specs/execution-call-edges/spec.md @@ -0,0 +1,63 @@ +## ADDED Requirements + +### Requirement: Runtime SHALL distinguish user-dags from fn-dags for call-edge tracking +For lineage tracking, a user-dag SHALL mean a DAG without an `argv` node and therefore without a cache key. An fn-dag SHALL mean a DAG with an `argv` node and therefore with a cache key. + +#### Scenario: User-dag caller has no cache key +- **WHEN** the runtime records a call edge from a DAG that does not have an `argv` node +- **THEN** it SHALL treat that caller as a user-dag identified by `index_id` +- **AND** it SHALL NOT require a caller cache key for that edge + +#### Scenario: Fn-dag caller uses cache key identity +- **WHEN** the runtime records a call edge from a DAG that has an `argv` node +- **THEN** it SHALL treat that caller as an fn-dag identified by its caller cache key + +### Requirement: Runtime SHALL persist forward call-edge indexes by caller type +The runtime SHALL persist sorted, deduped forward indexes for attempted calls at: + +- `calls/from/index/<index_id>.json` containing a list of callee cache keys for user-dag callers +- `calls/from/cache/<caller_cache_key>.json` containing a list of callee cache keys for fn-dag callers + +#### Scenario: User-dag forward lineage is recorded +- **WHEN** a user-dag initiates a new execution for callee cache key `ck1` +- **THEN** the runtime SHALL add `ck1` to `calls/from/index/<index_id>.json` + +#### Scenario: Fn-dag forward lineage is recorded +- **WHEN** an fn-dag with caller cache key `ck0` initiates a new execution for callee cache key `ck1` +- **THEN** the runtime SHALL add `ck1` to
`calls/from/cache/ck0.json` + +### Requirement: Runtime SHALL persist reverse call-edge indexes for callee cache keys +For each callee cache key, the runtime SHALL persist `calls/to/cache/<callee_cache_key>.json` as an object with two sorted, deduped lists: + +- `indexes`: calling user-dag index ids +- `cache_keys`: calling fn-dag cache keys + +#### Scenario: Reverse lineage stores user-dag caller +- **WHEN** a user-dag with `index_id` initiates a new execution for callee cache key `ck1` +- **THEN** the runtime SHALL add that `index_id` to `calls/to/cache/ck1.json.indexes` + +#### Scenario: Reverse lineage stores fn-dag caller +- **WHEN** an fn-dag with caller cache key `ck0` initiates a new execution for callee cache key `ck1` +- **THEN** the runtime SHALL add `ck0` to `calls/to/cache/ck1.json.cache_keys` + +### Requirement: Call-edge indexes SHALL represent attempted invocation lineage +The runtime SHALL write call-edge indexes only on the new-execution path, after it acquires the `cache_key` lock and confirms there is no active execution for that callee cache key. The runtime SHALL NOT delete call-edge indexes on terminal success or failure. + +#### Scenario: Resume does not create duplicate lineage writes +- **WHEN** `start_fn` resumes an already active execution for a callee cache key +- **THEN** it SHALL NOT rewrite the forward or reverse call-edge indexes for that execution + +#### Scenario: Failed execution preserves lineage +- **WHEN** a newly created execution later fails +- **THEN** the previously recorded call-edge indexes SHALL remain queryable + +### Requirement: Call-edge index updates SHALL be concurrency-safe and canonicalized +Each call-edge update SHALL perform a full read, merge, dedupe, and sort before a conditional ETag-checked write. On an ETag conflict, the runtime SHALL retry the full read/merge/write sequence until the write succeeds or the operation aborts.
+ +#### Scenario: Concurrent writes preserve both callers +- **WHEN** two callers concurrently update the same reverse index for callee cache key `ck1` +- **THEN** the runtime SHALL retry conflicts until both callers are present in the stored lists + +#### Scenario: Repeated edge writes remain canonical +- **WHEN** the same caller/callee edge is recorded more than once across retries or repeated new-execution attempts +- **THEN** the stored forward and reverse indexes SHALL remain deduped and sorted diff --git a/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/specs/runtime-execution-records/spec.md b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/specs/runtime-execution-records/spec.md new file mode 100644 index 0000000..c596382 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/specs/runtime-execution-records/spec.md @@ -0,0 +1,76 @@ +## ADDED Requirements + +### Requirement: Runtime SHALL separate cache identity from execution identity +The runtime SHALL treat `cache_key` as the stable computation identity and `execution_id` as the stable identity of one in-flight execution attempt. The runtime SHALL acquire execution locks by `cache_key` and SHALL propagate `execution_id` in the adapter envelope. 
+ +#### Scenario: First launch creates a new execution identity +- **WHEN** `start_fn` observes a cache miss and confirms there is no active execution for the computed `cache_key` +- **THEN** it creates a new `execution_id` for that launch attempt +- **AND** it invokes the adapter with both `cache_key` and `execution_id` + +#### Scenario: Resume preserves the current execution identity +- **WHEN** `start_fn` observes an active execution for a `cache_key` +- **THEN** it SHALL reuse the referenced `execution_id` +- **AND** it SHALL NOT create a new `execution_id` for that execution while resuming it + +### Requirement: Runtime SHALL maintain an active execution pointer per cache key +The runtime SHALL persist the currently active execution for a `cache_key` at `active/<cache_key>` as plain text containing only the `execution_id`. + +#### Scenario: Active pointer is created for a new running execution +- **WHEN** the first adapter call for a new execution returns `running` +- **THEN** the runtime SHALL create `active/<cache_key>` containing that execution's `execution_id` + +#### Scenario: Stale active pointer is discarded +- **WHEN** `active/<cache_key>` exists but `exec/<execution_id>.json` does not exist +- **THEN** the runtime SHALL delete `active/<cache_key>` +- **AND** it SHALL treat the cache key as having no active execution + +### Requirement: Runtime SHALL create immutable execution records +The runtime SHALL persist `exec/<execution_id>.json` only on the first non-terminal adapter result for an execution. That record SHALL contain the `execution_id`, the `cache_key`, the terminal-or-running status captured at creation time, and the durable adapter state returned from the first launch call. The runtime SHALL NOT modify that record after creation.
+ +#### Scenario: First running result creates the execution record +- **WHEN** the first adapter call for a new execution returns `running` with durable state +- **THEN** the runtime SHALL create `exec/<execution_id>.json` containing that state +- **AND** it SHALL NOT rewrite that object on later resumes + +#### Scenario: Resume uses stored immutable state +- **WHEN** `start_fn` resumes an active execution +- **THEN** it SHALL load the adapter `state` from `exec/<execution_id>.json` +- **AND** it SHALL pass that stored state to the adapter + +### Requirement: Adapter envelope and result schema SHALL follow the runtime-owned execution contract +The adapter envelope SHALL include `argv_ptr`, `cache_key`, `execution_id`, `remote`, `runnable`, and `state`. The adapter result SHALL use only `running`, `succeeded`, or `failed` statuses. `running` MUST include durable `state`. `succeeded` MUST include `dag_id`. `failed` MUST include `error`. + +#### Scenario: First adapter call uses null state +- **WHEN** the runtime invokes an adapter for a new execution +- **THEN** the adapter envelope SHALL include `state = null` + +#### Scenario: Later adapter state is ignored +- **WHEN** the runtime invokes an adapter for an existing execution and the adapter returns `running` with a different `state` +- **THEN** the runtime SHALL ignore the returned replacement state +- **AND** it SHALL continue to treat the original execution record as authoritative + +#### Scenario: Pending is rejected +- **WHEN** an adapter returns `pending` +- **THEN** the runtime SHALL reject that result as invalid adapter output + +### Requirement: Stale lock recovery SHALL preserve active execution ownership +The runtime SHALL use the lock for `cache_key` only to coordinate mutation of the active execution. If a lock is stale and an active execution record exists, the runtime SHALL recover the lock and resume that execution instead of creating a new one.
+ +#### Scenario: Stale lock with active execution resumes existing execution +- **WHEN** the lock for a `cache_key` is stale and `active/<cache_key>` points to an existing execution record +- **THEN** the runtime SHALL recover the lock +- **AND** it SHALL resume the existing `execution_id` +- **AND** it SHALL NOT launch a duplicate execution + +### Requirement: Failed execution SHALL be cached as a terminal result +If an adapter returns `failed`, the runtime SHALL complete the DAG with the error and SHALL publish that failed terminal outcome to cache for the `cache_key`. + +#### Scenario: Failed adapter result populates cache +- **WHEN** an adapter returns `failed` for a cache key +- **THEN** the runtime SHALL complete the DAG with the reported error +- **AND** it SHALL publish the failed outcome into cache for that cache key + +#### Scenario: Failed execution clears active pointer +- **WHEN** an active execution returns `failed` +- **THEN** the runtime SHALL delete `active/<cache_key>` before surfacing the failure diff --git a/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/tasks.md b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/tasks.md new file mode 100644 index 0000000..5b191b5 --- /dev/null +++ b/openspec/changes/archive/2026-04-25-simplify-exec-state-and-track-call-edges/tasks.md @@ -0,0 +1,31 @@ +## 1. Runtime execution identity and storage + +- [x] 1.1 Add runtime helpers for `active/<cache_key>` plain-text pointers and immutable `exec/<execution_id>.json` records under the remote S3 root. +- [x] 1.2 Update stale-lock handling so `start_fn` recovers an existing active execution when `exec/<execution_id>.json` exists and discards stale active pointers when it does not. +- [x] 1.3 Update the new-execution path to create a new `execution_id`, create the immutable execution record on the first `running` result, and create the active pointer only for non-terminal executions. + +## 2.
Adapter contract and execution flow + +- [x] 2.1 Update adapter payload validation and invocation so envelopes include `execution_id` and `state` on both first launch and resume calls. +- [x] 2.2 Remove `pending` from adapter output validation and enforce `running|succeeded|failed` with the required `state`, `dag_id`, and `error` fields. +- [x] 2.3 Update `start_fn` result handling so resumed executions always use the immutable stored state and ignore replacement state returned by later adapter calls. +- [x] 2.4 Update terminal handling so adapter `failed` completes the DAG with the error and publishes the failed result to cache before surfacing the failure. + +## 3. Executor and adapter migration + +- [x] 3.1 Update contrib executors and adapters to return all durable resume state on the first `running` launch result and stop relying on executor-owned mutable resumable-state objects. +- [x] 3.2 Remove or replace executor-private state-prefix usage that conflicts with the runtime-owned execution-record model. +- [x] 3.3 Add executor-level coverage proving first-launch state is sufficient for later polling and that later returned replacement state is ignored. + +## 4. Call-edge lineage indexes + +- [x] 4.1 Add runtime helpers for `calls/from/index/.json`, `calls/from/cache/.json`, and `calls/to/cache/.json` objects. +- [x] 4.2 Record call edges only on the new-execution path, after lock acquisition and inactive confirmation, using `index_id` for user-dag callers and `caller cache_key` for fn-dag callers. +- [x] 4.3 Implement read/merge/dedup/sort/conditional-write retry logic for all call-edge index updates. +- [x] 4.4 Add tests covering bidirectional lookup, deduped/sorted storage, concurrent update retries, and persistence of lineage across success and failure. + +## 5. 
Documentation and regression coverage + +- [x] 5.1 Update execution-model and adapter/runtime contract docs for `execution_id`, immutable execution records, active pointers, stale-lock recovery, and failed-result cache publication. +- [x] 5.2 Add or update docs for call-edge lineage storage, including the distinction between user-dags (no `argv`, no cache key) and fn-dags (`argv`, cache key). +- [x] 5.3 Add integration coverage for end-to-end new execution, resume, stale-lock recovery, failed-result caching, and call-edge recording. diff --git a/openspec/changes/archive/2026-04-26-add-dml-checkout/.openspec.yaml b/openspec/changes/archive/2026-04-26-add-dml-checkout/.openspec.yaml new file mode 100644 index 0000000..1b4051e --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-dml-checkout/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-27 diff --git a/openspec/changes/archive/2026-04-26-add-dml-checkout/design.md b/openspec/changes/archive/2026-04-26-add-dml-checkout/design.md new file mode 100644 index 0000000..5c77655 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-dml-checkout/design.md @@ -0,0 +1,72 @@ +## Context + +The current git-like surface supports revision resolution for merge/revert and DAG-level checkout, but it does not provide a top-level checkout that switches active repository checkout state to another revision. Users need a direct way to move between branches and point-in-time commits/tags while preserving clear semantics for whether subsequent commits advance a branch. + +`IndexOps.commit` advances branch history when a branch head is active. That existing behavior is useful and should remain unchanged: checking out a non-branch should act like detached HEAD in git, with no active HEAD to advance. No runtime behavior changes are introduced in this change. + +## Goals / Non-Goals + +**Goals:** +- Add `dml checkout ` for commit/tag/branch and other supported revision inputs. 
+- Resolve revision values locally and classify them as branch-attached vs detached targets. +- Keep branch checkout behavior head-attached so new commits advance the selected branch. +- Keep non-branch checkout behavior detached by clearing active HEAD and relying on existing commit semantics. +- Define clone behavior as `fetch` followed by `checkout` so clone target semantics match checkout semantics. +- Support clone by branch or tag refs, but not by direct commit ref until fetch supports commit-target retrieval. +- Preserve local-only resolution behavior (no implicit network fetches). +- Return clear CLI feedback for attached vs detached mode. + +**Non-Goals:** +- Adding implicit fetch during checkout. +- Changing DAG-level checkout command semantics. +- Introducing new merge/rebase behavior as part of this change. + +## Decisions + +### Checkout target classification +`dml checkout` resolves the revision first, then classifies the resolved reference: +- Branch target -> attached mode (active branch set). +- Any non-branch target (tag, commit ref, ancestry expression result, remote-tracking ref) -> detached mode. + +Alternative considered: always create/attach a synthetic branch for non-branch targets. Rejected because it hides detached behavior and can cause accidental branch movement. + +### Detached checkout semantics use existing commit behavior +Detached checkout clears active HEAD. Commits created while detached use existing `IndexOps.commit` behavior: commit the index without moving any branch head. + +Alternative considered: introduce a new runtime checkout mode flag. Rejected because detached semantics already follow directly from the absence of an active HEAD. + +### Local-only revision resolution +Checkout uses existing local revision resolution without implicit remote operations. Unfetched remote URIs fail with a local-resolution error. + +Alternative considered: auto-fetch unresolved remote targets during checkout. 
Rejected to keep checkout deterministic, offline-safe, and consistent with existing revision rules. + +### Clone is fetch then checkout +`dml clone` reuses existing primitives by first fetching the requested remote ref, then running checkout against the fetched revision target. This keeps one source of truth for attach/detach semantics and avoids bespoke clone-only state transitions. + +For now, clone accepts targets that fetch can materialize as refs (branches and tags). Clone by direct commit is rejected because fetch does not yet support fetching arbitrary commit objects by commit id. + +Clone does not invoke `init`; it performs first-time repository initialization directly, then fetches and checks out the requested target. `init` hooks remain scoped to explicit `dml init` invocation and do not run during clone. + +Alternative considered: add direct commit fetch as part of this change. Rejected to keep scope focused on checkout semantics and avoid expanding remote transport behavior. + +### CLI mode visibility +`dml checkout` output explicitly states whether checkout is attached to a branch or detached, and identifies the resolved target. + +Alternative considered: minimal success output only. Rejected because explicit mode reporting avoids user confusion about later commit behavior. + +## Risks / Trade-offs + +- Detached behavior confusion for users expecting branch movement -> Mitigation: explicit checkout mode messaging and docs examples. +- Ambiguous revision parsing can attach/detach unexpectedly -> Mitigation: deterministic resolution precedence and tests across branch/tag/commit forms. +- Detached commits may be hard to recover if users forget refs -> Mitigation: ensure command output reports resolved commit and recommend branch checkout for durable progression. 
+- Clone target confusion for revision inputs -> Mitigation: document and enforce that clone currently supports fetched branch/tag refs, and return a specific error for direct commit clone attempts. + +## Migration Plan + +- No data migration required. +- Add checkout command and HEAD attach/detach handling without changing runtime internals. +- Update CLI/help docs and add tests before release. + +## Open Questions + +- None for initial implementation. diff --git a/openspec/changes/archive/2026-04-26-add-dml-checkout/proposal.md b/openspec/changes/archive/2026-04-26-add-dml-checkout/proposal.md new file mode 100644 index 0000000..0338215 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-dml-checkout/proposal.md @@ -0,0 +1,33 @@ +## Why + +Users can already check out a single DAG from another revision, but there is no top-level `dml checkout` for moving repository state to a specific revision target. Adding `dml checkout` now closes a core git-like workflow gap and makes branch-vs-detached HEAD behavior explicit. + +## What Changes + +- Add a new `dml checkout ` command that accepts commits, tags, branches, and other supported revision expressions. +- Extend revision resolution to infer the intended target kind (branch, tag, commit, remote-tracking ref, ancestry expression) and resolve it locally. +- Define `dml clone` as composition of `fetch` then `checkout`, using the fetched target as checkout input. +- Allow clone-by-tag when the tag can be fetched as a ref target, and reject clone-by-commit for now because fetch does not yet support direct commit fetch. +- Define checkout behavior differences: + - Branch checkout sets active HEAD to that branch so later commits advance branch history. + - Non-branch checkout (commit, tag, detached revision) clears active HEAD, so `IndexOps.commit` keeps its current behavior (commit index only, no branch advancement). +- Keep checkout local-only: no implicit network fetches for unresolved remote URIs. 
+- Surface clear CLI feedback showing whether checkout is branch-attached or detached. + +## Capabilities + +### New Capabilities + +- None. + +### Modified Capabilities + +- `git-like-commit-ops`: Extend revision resolution and checkout semantics from DAG-level checkout to top-level repository checkout. + +## Impact + +- CLI surface for new `dml checkout` command and status messaging. +- Head/index operations that decide whether commits advance branch pointers (branch checkout) or remain detached (no HEAD). +- Commit-ish parser/resolver behavior for branch/tag/commit inference. +- Clone flow behavior for branch/tag targets via `fetch -> checkout` composition and explicit non-support for commit-target clone. +- Tests for checkout target resolution, detached-mode behavior, and branch re-attachment. diff --git a/openspec/changes/archive/2026-04-26-add-dml-checkout/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-04-26-add-dml-checkout/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..b74d3eb --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-dml-checkout/specs/git-like-commit-ops/spec.md @@ -0,0 +1,74 @@ +## MODIFIED Requirements + +### Requirement: Revision resolution +The system SHALL resolve revision values used by git-like commands to concrete local commit refs without performing network fetches. 
+ +#### Scenario: Resolve branch shorthand +- **WHEN** a command receives `main` as a revision +- **THEN** the system resolves it as local branch `main` + +#### Scenario: Resolve remote-tracking branch shorthand +- **WHEN** a command receives `origin/main` as a revision +- **THEN** the system resolves it through the configured remote URI to local tracking ref `dml:///#main` + +#### Scenario: Resolve fetched DML branch URI +- **WHEN** a command receives `dml://alice/tools#main` as a revision and that tracking ref exists locally +- **THEN** the system resolves it to the commit stored for that tracking ref + +#### Scenario: Resolve fetched DML tag URI +- **WHEN** a command receives `dml://alice/tools@v1.0` as a revision and that tracking ref exists locally +- **THEN** the system resolves it to the commit stored for that tracking ref + +#### Scenario: Unfetched DML URI is not fetched implicitly +- **WHEN** a command receives `dml://alice/tools#main` as a revision and no matching local tracking ref exists +- **THEN** the command fails without contacting the remote + +#### Scenario: Resolve first-parent ancestry +- **WHEN** a command receives `HEAD~2` as a revision +- **THEN** the system resolves it by walking two first-parent steps from the current head commit + +#### Scenario: Resolve local tag shorthand +- **WHEN** a command receives `v1.0` as a revision and `v1.0` resolves as a local tag +- **THEN** the system resolves it to the commit referenced by that tag + +### Requirement: Checkout repository state from revision +The system SHALL support checking out repository state from a resolved revision and SHALL distinguish branch-attached from detached checkouts. 
+ +#### Scenario: Checkout branch attaches runtime +- **WHEN** `dml checkout main` resolves `main` to a local branch +- **THEN** the system sets active HEAD to branch `main` and reports branch-attached checkout + +#### Scenario: Checkout tag detaches runtime +- **WHEN** `dml checkout v1.0` resolves `v1.0` to a tag target commit +- **THEN** the system clears active HEAD and reports detached checkout at that commit + +#### Scenario: Checkout commit expression detaches runtime +- **WHEN** `dml checkout HEAD~1` resolves to a concrete commit +- **THEN** the system clears active HEAD and reports detached checkout at that commit + +#### Scenario: Commit while detached does not advance branch +- **WHEN** a user checks out a non-branch revision and then runs commit flow through `IndexOps.commit` +- **THEN** the system commits the index without advancing any branch head + +#### Scenario: Checkout unresolved remote URI fails locally +- **WHEN** `dml checkout dml://alice/tools#main` is requested and no local tracking ref exists for that URI +- **THEN** checkout fails without implicit fetch and reports that the revision cannot be resolved locally + +### Requirement: Clone composes fetch then checkout +The system SHALL implement clone as `fetch` followed by `checkout`, using the fetched target revision for checkout semantics. 
+ +#### Scenario: Clone branch uses fetch then attached checkout +- **WHEN** `dml clone dml://alice/tools#main` is requested +- **THEN** the system fetches `dml://alice/tools#main` and checks out `main` as a branch-attached HEAD + +#### Scenario: Clone tag uses fetch then detached checkout +- **WHEN** `dml clone dml://alice/tools@v1.0` is requested +- **THEN** the system fetches `dml://alice/tools@v1.0` and checks out the resolved commit in detached mode + +#### Scenario: Clone direct commit is not supported yet +- **WHEN** `dml clone @` is requested for a direct commit target that is not fetchable as a branch/tag ref +- **THEN** clone fails with an error indicating direct commit clone is unsupported until fetch supports commit-target retrieval + +#### Scenario: Clone does not run init hooks +- **WHEN** `dml clone` initializes a local repository for the first time +- **THEN** the system does not invoke `dml init` and does not run init hooks as part of clone diff --git a/openspec/changes/archive/2026-04-26-add-dml-checkout/tasks.md b/openspec/changes/archive/2026-04-26-add-dml-checkout/tasks.md new file mode 100644 index 0000000..7ba2064 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-dml-checkout/tasks.md @@ -0,0 +1,40 @@ +## 1. CLI Checkout Surface + +- [x] 1.1 Add `dml checkout ` command wiring in the CLI command tree. +- [x] 1.2 Add CLI output/messages that explicitly report branch-attached vs detached scratch mode after checkout. +- [x] 1.3 Add CLI help text and usage examples for branch, tag, and commit-expression checkout. + +## 2. Clone Composition and Target Rules + +- [x] 2.1 Refactor/implement `dml clone` as `fetch` followed by `checkout`. +- [x] 2.2 Ensure clone target parsing accepts fetchable branch/tag refs and routes to checkout with consistent attach/detach semantics. +- [x] 2.3 Add explicit clone error for direct commit targets that fetch cannot retrieve yet. 
+- [x] 2.4 Ensure clone performs first-time repo initialization directly and does not invoke `init` hooks. + +## 3. Revision Resolution and Classification + +- [x] 3.1 Extend revision resolution to infer branch/tag/commit-like targets for checkout routing. +- [x] 3.2 Add local tag shorthand resolution coverage for checkout and keep branch/URI/`HEAD~N` behavior consistent. +- [x] 3.3 Ensure checkout resolution remains local-only and returns a clear error for unfetched remote URIs. + +## 4. Runtime Checkout Mode + +- [x] 4.1 Implement branch checkout flow that sets active HEAD to the selected branch. +- [x] 4.2 Implement non-branch checkout flow that clears active HEAD (detached). +- [x] 4.3 Keep runtime internals unchanged and rely on existing detached commit behavior. + +## 5. Commit Progression Semantics + +- [x] 5.1 Update commit/index operations so commits in detached mode do not advance shared branch heads. +- [x] 5.2 Preserve existing `IndexOps.commit` behavior without semantic changes. +- [x] 5.3 Validate transitions between detached and attached checkout states across consecutive checkout/commit operations. + +## 6. Tests and Documentation + +- [x] 6.1 Add tests for checkout resolution across branch, tag, commit ref, and `HEAD~N` expressions. +- [x] 6.2 Add tests verifying detached checkout commits index state without branch-head movement. +- [x] 6.3 Add tests verifying branch re-attachment resumes branch-head progression. +- [x] 6.4 Add tests for clone branch and clone tag using `fetch -> checkout` composition. +- [x] 6.5 Add tests verifying clone rejects direct commit targets until fetch supports commit retrieval. +- [x] 6.6 Add tests verifying clone does not invoke `init` or run init hooks. +- [x] 6.7 Update relevant docs for `dml checkout`, clone target semantics, and local-only checkout resolution behavior. 
diff --git a/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/.openspec.yaml b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/.openspec.yaml new file mode 100644 index 0000000..3f1f00e --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-26 diff --git a/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/design.md b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/design.md new file mode 100644 index 0000000..17c30ff --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/design.md @@ -0,0 +1,246 @@ +## Context + +DaggerML already models research history as immutable `Commit` objects, branch `Head` pointers, and `Tree.dags` mappings. Remote sync already stores immutable CAS objects and refs in S3, but branch publication is currently snapshot/tag-like rather than project/branch-oriented. + +This change introduces a monorepo-style remote namespace with many owners, projects, and branches sharing one CAS. It also defines git-like operations that move branch heads safely and create explicit commits for merges, reverts, and DAG checkout. + +## Goals / Non-Goals + +**Goals:** + +- Support remote project refs at `refs/projects///{heads,tags}/`, with mutable heads and immutable tags. +- Support local remotes such as `origin` mapped to a project URI and storage root. +- Support fetching an explicit project URI into a local remote-tracking namespace so users can checkout DAGs from other projects. +- Store fetched remote branches/tags locally under canonical normalized DML URIs. +- Initialize local projects under a project directory with all DML-managed state inside `.dml/`. +- Support shell hooks for `init` and `clone` so users can run their normal project bootstrap commands. +- Make `clone`, `fetch`, `pull`, and `push` behave like git at the branch level. 
+- Require safe push updates using both ETag compare-and-swap and fast-forward-only ancestry unless `--force` is specified. +- Provide DAG-level checkout from any commit-ish into the current branch as a new commit. +- Keep conflict handling explicit and deterministic. + +**Non-Goals:** + +- Individual DAG push/pull/package distribution is out of scope. +- Persistent merge-conflict state is out of scope for the initial design. +- Automatic tracking of imported DAG versions is out of scope. +- Non-S3 remotes are out of scope. + +## Decisions + +### Remote namespace + +Use this remote layout under the protocol root: + +```text +refs/projects///heads/.json +refs/projects///tags/.json +refs/cache/.json +refs/dags/.json +cas/sha256/// +``` + +Project refs are namespaced by owner and project, while CAS remains shared across the monorepo. This gives project isolation for discovery and permissions without losing object deduplication. + +### Mutable branch heads + +Branch heads under `refs/projects///heads/` are mutable pointers to commit manifests. Tags under `refs/projects///tags/` are immutable named refs; creating an existing tag path MUST fail. + +Project branch and tag refs use the same remote ref payload schema as existing manifest refs: `kind`, `schema`, `target`, `created_at`, `targets`, and `meta`. Project refs MUST point to commit manifests, MUST include `targets`, and MUST satisfy the same manifest-target integrity checks before a ref is written. + +Alternative considered: keep only immutable `tags//.json`. That makes exact snapshots simple, but it does not provide a natural latest branch pointer for `fetch`, `pull`, or `push`. + +### Global and local config + +Each local repo stores DML-managed state under `/.dml/`. The directory contains `.dml/config.toml`, the local object database directory `.dml/db/`, and `.dml/.gitignore` with `*` so DML internals are not committed to the enclosing source repository. 
+ +Global DML config stores user defaults and bootstrap hooks. Its config directory resolves in this order: + +```text +1. $DML_CONFIG_HOME, if set +2. $XDG_CONFIG_HOME/dml, if set +3. ~/.config/dml +``` + +The global config file is `/config.toml`, for example `~/.config/dml/config.toml`: + +```toml +[user] +name = "alice" + +[defaults] +branch = "main" + +[hooks] +post-init = ["uv init"] +post-clone = ["uv sync"] +``` + +Each local repo stores project identity and named remotes in `.dml/config.toml`, for example: + +```toml +[project] +name = "my-project" +owner = "alice" +uri = "dml://alice/my-project" + +[branch] +current = "main" + +[remotes.origin] +uri = "dml://alice/my-project" +bucket = "example-bucket" +prefix = "team-monorepo" +``` + +Required project-local fields are: + +| Field | Purpose | +| --- | --- | +| `[project].name` | Local project name. | +| `[project].owner` | Project owner. | +| `[project].uri` | Canonical project URI, `dml:///`. | +| `[branch].current` | Current local branch name. | +| `[remotes.].uri` | Remote project URI. | +| `[remotes.].bucket` | Remote S3 bucket. | +| `[remotes.].prefix` | Remote S3 prefix containing the DML protocol root. | + +Configuration resolution uses waterfall precedence: + +```text +explicit CLI/API argument > environment variable > config file value +``` + +Environment variables use `DML_` names for config values, including `DML_USER`, `DML_DEFAULT_BRANCH`, `DML_PROJECT_NAME`, `DML_PROJECT_OWNER`, `DML_REMOTE_PROJECT`, `DML_BRANCH`, `DML_REMOTE`, `DML_REMOTE_ROOT`, `DML_REMOTE_BUCKET`, and `DML_REMOTE_PREFIX`. Explicit command arguments always win over environment variables and config. + +The supported DML environment variable surface for this project model is: + +| Env var | Role | +| --- | --- | +| `DML_CONFIG_HOME` | Global DML config directory override. | +| `DML_USER` | User identity and default project owner. | +| `DML_BRANCH` | Selected branch override for commands. 
| +| `DML_DEFAULT_BRANCH` | Global default branch override for init/branch defaults. | +| `DML_PROJECT_NAME` | Project name override and hook context. | +| `DML_PROJECT_OWNER` | Project owner override and hook context. | +| `DML_REMOTE_PROJECT` | Canonical local remote project override. | +| `DML_PROJECT_HOME` | Hook context: absolute project root directory. | +| `DML_HOOK` | Hook context: hook name such as `post-init` or `post-clone`. | +| `DML_REMOTE` | Selected named remote override. | +| `DML_REMOTE_NAME` | Hook context: remote name such as `origin`. | +| `DML_REMOTE_ROOT` | Remote project URI override. | +| `DML_REMOTE_BUCKET` | Remote S3 bucket override. | +| `DML_REMOTE_PREFIX` | Remote S3 prefix override. | + +The following legacy environment variables are removed from this project model and MUST NOT be used by new git-like project operations: + +| Env var | Replacement | +| --- | --- | +| `DML_REPO` | `.dml/db/` under the resolved project directory. | +| `DML_REMOTE_ROOT` | `[remotes.].bucket` and `[remotes.].prefix`, or `DML_REMOTE_BUCKET` and `DML_REMOTE_PREFIX`. | +| `DML_DYNAMODB_TABLE` | None; DynamoDB execution state is out of scope. | +| `DML_REMOTE_CACHE` | None; legacy cache naming is out of scope. | + +Project creation defaults owner to global `[user].name`, so creating `my-project` yields `dml://alice/my-project` unless an owner is explicitly provided. Branch creation defaults to global `[defaults].branch` when applicable, falling back to `main` when unset. + +### Init and clone directory setup + +`dml init ` creates `/`, initializes `/.dml/`, writes `.dml/config.toml`, writes `.dml/.gitignore` containing `*`, creates the local object database under `.dml/db/`, and creates an initial branch with an empty initial commit/tree. + +If `/` already exists, `dml init ` fails. Users who want to initialize an existing directory must `cd` into that directory and run `dml init --here `. 
+ +`dml init --here ` initializes the current directory instead of creating a child directory. The project name still comes from ``. Hooks still run for `--here` unless the user also specifies `--no-hooks`. + +`dml clone dml:///` creates a local project directory, initializes `.dml/` the same way as `init`, records `origin`, fetches the selected remote branch, and initializes the local branch state from it. + +When clone is given only a project URI, it clones the configured default branch, falling back to `main`. A user may clone a different branch by specifying a branch commit-ish, for example `dml clone dml://alice/demo#experiment`. + +### Init and clone shell hooks + +`init` and `clone` support configured shell hooks for user-defined project bootstrap commands such as `uv init`. Hooks run in the project directory after it is created and after `.dml/` exists. Hook failures MUST stop the command and report the failing hook. + +Bootstrap hooks are read from global DML config because project-local config does not exist until the command creates `.dml/config.toml`. Hook keys are ordered lists named `post-init` and `post-clone`: + +```toml +[hooks] +post-init = ["uv init"] +post-clone = ["uv sync"] +``` + +Commands run hooks in listed order with the project directory as the working directory. `init` runs only `hooks.post-init`; `clone` runs only `hooks.post-clone`. `dml init --no-hooks ` and `dml init --here --no-hooks ` skip `post-init`; `dml clone --no-hooks ` skips `post-clone`. + +Hook commands receive environment variables describing the invocation: + +```text +DML_HOOK=post-init|post-clone +DML_PROJECT_HOME=/absolute/path/to/project +DML_PROJECT_NAME= +DML_PROJECT_OWNER= +DML_CONFIG_HOME= +DML_BRANCH= +``` + +Clone hooks also receive: + +```text +DML_REMOTE_NAME=origin +DML_REMOTE_ROOT=dml:/// +``` + +Hooks are intentionally shell commands rather than Python callbacks so users can reuse their normal project setup tools. 
+ +### Fetched remote tracking refs + +Fetched remote branch and tag pointers are tracked locally by their canonical DML URI. The underlying DB representation is an implementation detail and is not part of user-facing command syntax. + +Canonical tracking URIs are: + +```text +dml://alice/tools#main +dml://alice/tools#feature/x +dml://alice/tools@v1.0 +``` + +User-facing commands accept DML URI commit-ish values such as `dml://alice/tools#main` or configured remote shorthands such as `origin/main`; these resolve locally to the commit associated with the canonical DML URI. + +DML URIs used for project refs MUST be canonicalized before storage. Canonical project tracking URIs include only owner, project, and a concrete branch or tag identifier. Derived expressions such as `HEAD~2` are never stored as tracking URIs; if a user fetches or resolves an expression, the stored tracking ref uses the canonical remote branch or tag URI that produced the fetched commit. + +Canonical DML URIs MUST be no longer than the current ref-id limit of 64 bytes. Commands that create, configure, fetch, or push remote project refs MUST validate this limit before writing local heads or remote refs. + +The 64-byte limit is accepted for the initial project model. Overlong ASCII URIs fail validation rather than being hashed or stored through an alternate compatibility layer. + +Remote push/fetch MUST parse canonical DML URIs into structured remote paths such as `refs/projects///heads/.json` or `refs/projects///tags/.json`. Implementations MUST NOT use raw DML URI strings as remote object paths. + +### Fetch, pull, push, clone + +- `clone` creates a project directory, initializes `.dml/`, initializes local state from a remote project branch, and records `origin`. +- `fetch` downloads a remote branch head and materializes it into a local tracking ref for a canonical URI such as `dml://alice/my-project#main`. 
+- `fetch dml://<owner>/<project>[identifier]` downloads an explicitly addressed project branch or tag and materializes it into a local tracking ref so users can inspect or checkout DAGs from that project without merging it into their current project. +- `pull` performs `fetch` and then merges the fetched remote-tracking head into the current/local branch. +- `push` uploads missing CAS/manifests and updates an existing remote branch head. If the remote branch does not exist, push fails unless `--create` is provided. + +Push MUST be fast-forward-only unless `--force` is provided. Push MUST still use an ETag conditional update even with `--force`, so force bypasses ancestry safety but not concurrent-update safety. `--create` writes only when the remote branch ref does not already exist; if another client creates the branch first, the create push fails. + +### Merge conflicts + +Merge operates on `Tree.dags`. A conflict occurs when both sides changed the same DAG name differently since the merge base. Initial UX aborts with a structured conflict list. Strategy flags can be added without introducing persistent conflict state. + +### DAG checkout + +Use one command for DAG-level extraction from history: + +```bash +dml dag checkout <commit-ish> <dag-name> [--as <new-name>] [--replace] +``` + +The command resolves `<commit-ish>`, reads `<dag-name>` from that commit's tree, writes the DAG ref into the current branch tree under the target name, creates a new commit, and advances the current head. Existing target names require `--replace` unless the ref is unchanged. Because explicit URI fetches create local remote-tracking heads, users can fetch another project and then checkout one of its DAGs into the current project. + +This covers DAG revert use cases without a separate `dml dag revert`; users can checkout from `HEAD~N`, a branch, a remote-tracking branch, a fetched DML URI, or an explicit commit ref. Commit-ish resolution is local-only: DML URIs resolve through existing local tracking refs and MUST NOT implicitly fetch from the network.
+ +## Risks / Trade-offs + +- Concurrent remote updates can race -> use ETag compare-and-swap for branch head writes. +- Fast-forward checks require commit ancestry availability -> fetch/materialize the remote head before evaluating push safety. +- Abort-only merge conflicts may be less convenient than git's conflict index -> start simpler and add explicit resolution flags later if needed. +- `commit-ish` parsing can become complex -> implement a small, documented local-only grammar first: full commit refs, local branch shorthands, configured remote shorthands, fetched DML URI tracking refs, `HEAD`, and `~N` ancestry. Keep internal DB refs out of user-facing syntax. +- Mutable branch refs require schema/version care -> keep branch head ref payloads validated and include target manifest metadata. diff --git a/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/proposal.md b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/proposal.md new file mode 100644 index 0000000..dc23682 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/proposal.md @@ -0,0 +1,37 @@ +## Why + +DaggerML already has commits, heads, trees, and S3-backed CAS sync, but the remote UX does not yet behave like git for shared project branches. Users need a clear project/branch remote model with safe `clone`, `fetch`, `pull`, `push`, `merge`, and DAG checkout operations. + +## What Changes + +- Add a remote project namespace layout: `refs/projects/<owner>/<project>/{heads,tags}/`. +- Add mutable remote branch heads and immutable project tags for git-like branch discovery and stable releases. +- Store fetched remote branches/tags locally under canonical normalized `dml://<owner>/<project>#<branch>` or `dml://<owner>/<project>@<tag>` URIs. +- Add global DML config under `$DML_CONFIG_HOME`, `$XDG_CONFIG_HOME/dml`, or `~/.config/dml` for user defaults and init/clone hooks.
+- Add project-local config under `.dml/config.toml` for project identity and named remotes such as `origin`, enabling commands like `dml push origin main`. +- Define `dml init <name>` and `dml clone` project-directory initialization, including `<project-dir>/.dml/` config, database storage, `.dml/.gitignore`, and optional shell hooks for user project setup commands. +- Define git-like command semantics for `dml clone`, `dml fetch`, `dml pull`, `dml push`, `dml merge`, and `dml revert`, including explicit URI fetches such as `dml fetch dml://<owner>/<project>[identifier]`. +- Define `dml dag checkout <commit-ish> <dag-name> [--as <new-name>] [--replace]` for copying one DAG from another commit into the current branch as a new commit. +- Require push safety checks: conditional remote head update by ETag and fast-forward-only ancestry unless `--force` is specified. +- Keep `--force` subject to ETag checks to prevent lost-update races. + +## Capabilities + +### New Capabilities + +- `remote-project-refs`: Remote project namespace, branch/tag ref layout, local remote configuration, and safe branch push/fetch/pull behavior. +- `git-like-commit-ops`: User-facing merge, revert, and DAG checkout operations that create commits and advance heads. + +### Modified Capabilities + +- None. + +## Impact + +- Remote data model and protocol docs/specs for project refs, DML URI local tracking refs, and branch-head mutation. +- Internal remote ops for project-aware branch heads, immutable tags, conditional updates, fetch/pull/push, and clone. +- Commit/head ops for head-advancing merge/revert and DAG checkout. +- CLI commands for git-like branch operations and DAG checkout. +- Global DML configuration for user defaults and bootstrap hooks. +- Local repository configuration for project identity and named remotes. +- Local project initialization hooks for commands such as `uv init`.
diff --git a/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..a07bba6 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/specs/git-like-commit-ops/spec.md @@ -0,0 +1,97 @@ +## ADDED Requirements + +### Requirement: Merge advances current head +The system SHALL merge another commit or branch into the current branch by creating a merge commit when needed and advancing the current head. + +#### Scenario: Merge non-conflicting branch +- **WHEN** a user merges a branch whose tree changes do not conflict with the current branch +- **THEN** the system creates a merge commit with both commits as parents and advances the current head to that merge commit + +#### Scenario: Merge fast-forward +- **WHEN** the current branch head is an ancestor of the merged commit +- **THEN** the system advances the current head to the merged commit without creating an unnecessary merge commit + +### Requirement: Merge detects DAG-name conflicts +The system SHALL reject merges where both sides changed the same DAG name to different DAG refs since the merge base. + +#### Scenario: Conflicting DAG name +- **WHEN** the merge base has `train -> dag:a`, the current branch has `train -> dag:b`, and the merged branch has `train -> dag:c` +- **THEN** merge fails with a conflict naming `train` and does not advance the current head + +### Requirement: Revert commit creates inverse commit +The system SHALL revert a commit by applying the inverse of that commit's tree diff to the current branch as a new commit. + +A revert SHALL only modify a DAG name when the current tree still matches the post-commit value introduced by the reverted commit. If the current tree no longer matches that post-commit value, revert SHALL fail with a conflict and SHALL NOT advance the current branch. 
+ +#### Scenario: Revert added DAG +- **WHEN** the reverted commit added DAG name `train` +- **THEN** the revert commit removes `train` from the current branch tree if safe to apply + +#### Scenario: Revert changed DAG +- **WHEN** the reverted commit changed `train` from `dag:a` to `dag:b` +- **THEN** the revert commit changes `train` back to `dag:a` if the current tree still permits safe application + +#### Scenario: Revert changed DAG conflict +- **WHEN** the reverted commit changed `train` from `dag:a` to `dag:b` and the current tree has `train -> dag:c` +- **THEN** revert fails with a conflict naming `train` and does not advance the current branch + +#### Scenario: Revert added DAG conflict +- **WHEN** the reverted commit added `train -> dag:a` and the current tree has `train -> dag:b` +- **THEN** revert fails with a conflict naming `train` and does not advance the current branch + +#### Scenario: Revert removed DAG conflict +- **WHEN** the reverted commit removed `train -> dag:a` and the current tree already has `train -> dag:b` +- **THEN** revert fails with a conflict naming `train` and does not advance the current branch + +### Requirement: DAG checkout from commit-ish +The system SHALL support checking out one DAG from a resolved commit-ish into the current branch tree and committing that change. 
+ +#### Scenario: Checkout DAG with same name +- **WHEN** `dml dag checkout HEAD~1 train` resolves `HEAD~1` to a commit containing `train -> dag:a` +- **THEN** the system creates a new commit whose tree contains `train -> dag:a` and advances the current head + +#### Scenario: Checkout DAG with alias +- **WHEN** `dml dag checkout origin/main train --as baseline_train` resolves `origin/main` to a commit containing `train -> dag:a` +- **THEN** the system creates a new commit whose tree contains `baseline_train -> dag:a` and advances the current head + +#### Scenario: Checkout refuses overwrite by default +- **WHEN** the target name already exists with a different DAG ref and `--replace` is not provided +- **THEN** DAG checkout fails without creating a commit or advancing the current head + +#### Scenario: Checkout replaces when requested +- **WHEN** the target name already exists with a different DAG ref and `--replace` is provided +- **THEN** DAG checkout creates a new commit with the target name pointing to the checked-out DAG ref + +### Requirement: Commit-ish resolution +The system SHALL resolve commit-ish values used by git-like commands to concrete local commit refs without performing network fetches. 
+ +#### Scenario: Resolve branch shorthand +- **WHEN** a command receives `main` as a commit-ish +- **THEN** the system resolves it as local branch `main` + +#### Scenario: Resolve remote-tracking branch shorthand +- **WHEN** a command receives `origin/main` as a commit-ish +- **THEN** the system resolves it through the configured remote URI to local tracking ref `dml://<owner>/<project>#main` + +#### Scenario: Resolve fetched DML branch URI +- **WHEN** a command receives `dml://alice/tools#main` as a commit-ish and that tracking ref exists locally +- **THEN** the system resolves it to the commit stored for that tracking ref + +#### Scenario: Resolve fetched DML tag URI +- **WHEN** a command receives `dml://alice/tools@v1.0` as a commit-ish and that tracking ref exists locally +- **THEN** the system resolves it to the commit stored for that tracking ref + +#### Scenario: Unfetched DML URI is not fetched implicitly +- **WHEN** a command receives `dml://alice/tools#main` as a commit-ish and no matching local tracking ref exists +- **THEN** the command fails without contacting the remote + +#### Scenario: Resolve first-parent ancestry +- **WHEN** a command receives `HEAD~2` as a commit-ish +- **THEN** the system resolves it by walking two first-parent steps from the current head commit + +### Requirement: DAG removal remains explicit +The system SHALL remove DAG names from the current branch tree only through an explicit DAG removal command, not through DAG checkout of an absent source.
+ +#### Scenario: Checkout absent DAG +- **WHEN** DAG checkout targets a commit that does not contain the requested DAG name +- **THEN** the command fails without deleting the target name from the current branch diff --git a/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/specs/remote-project-refs/spec.md b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/specs/remote-project-refs/spec.md new file mode 100644 index 0000000..c42b6a7 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/specs/remote-project-refs/spec.md @@ -0,0 +1,352 @@ +## ADDED Requirements + +### Requirement: Remote project refs namespace +The system SHALL store project branch and tag refs under `refs/projects/<owner>/<project>/{heads,tags}/` within the remote protocol root. + +#### Scenario: Branch head path +- **WHEN** project `alice/demo` branch `main` is addressed on the remote +- **THEN** the branch head ref path is `refs/projects/alice/demo/heads/main.json` + +#### Scenario: Tag path +- **WHEN** project `alice/demo` tag `v1.0` is addressed on the remote +- **THEN** the tag ref path is `refs/projects/alice/demo/tags/v1.0.json` + +### Requirement: Branch heads are mutable and project tags are immutable +The system SHALL allow project branch head refs to move through safe update operations and SHALL reject attempts to overwrite existing project tag refs. + +#### Scenario: Branch head update +- **WHEN** a push safely advances project `alice/demo` branch `main` +- **THEN** the existing `refs/projects/alice/demo/heads/main.json` ref may be replaced by the new branch head payload + +#### Scenario: Tag overwrite rejected +- **WHEN** `refs/projects/alice/demo/tags/v1.0.json` already exists +- **THEN** publishing tag `v1.0` fails without changing the existing tag ref + +### Requirement: Project refs use manifest ref payloads +The system SHALL encode project branch and tag refs using the existing remote ref payload schema for manifest refs.
+ +Project branch and tag refs SHALL point to commit manifests, SHALL include direct DAG `targets`, and SHALL fail before writing the ref if the target manifest is missing, invalid, or has `closure["dag"]` inconsistent with the ref `targets["dag"]`. + +#### Scenario: Project branch ref payload +- **WHEN** project `alice/demo` branch `main` is written +- **THEN** `refs/projects/alice/demo/heads/main.json` contains `kind`, `schema`, `target`, `created_at`, `targets`, and `meta` fields following the remote ref schema + +#### Scenario: Project tag ref payload +- **WHEN** project `alice/demo` tag `v1.0` is written +- **THEN** `refs/projects/alice/demo/tags/v1.0.json` contains `kind`, `schema`, `target`, `created_at`, `targets`, and `meta` fields following the remote ref schema + +#### Scenario: Project ref target validation fails closed +- **WHEN** a project branch or tag ref would point to a missing manifest, invalid manifest, non-commit manifest, or inconsistent direct DAG targets +- **THEN** the write fails without creating or updating the project ref + +### Requirement: Shared remote CAS +The system SHALL store immutable CAS objects in a shared remote CAS under `cas/sha256///` independent of owner, project, or branch. + +#### Scenario: Two projects reference same object +- **WHEN** two project refs target manifests that include the same CAS object +- **THEN** the remote stores that CAS object at one shared CAS path + +### Requirement: Global DML config +The system SHALL load global DML config from `$DML_CONFIG_HOME/config.toml`, `$XDG_CONFIG_HOME/dml/config.toml`, or `~/.config/dml/config.toml` in that precedence order. 
+ +#### Scenario: DML config home wins +- **WHEN** `DML_CONFIG_HOME` is set +- **THEN** the system reads global config from `$DML_CONFIG_HOME/config.toml` + +#### Scenario: XDG config home fallback +- **WHEN** `DML_CONFIG_HOME` is unset and `XDG_CONFIG_HOME` is set +- **THEN** the system reads global config from `$XDG_CONFIG_HOME/dml/config.toml` + +#### Scenario: Default config fallback +- **WHEN** neither `DML_CONFIG_HOME` nor `XDG_CONFIG_HOME` is set +- **THEN** the system reads global config from `~/.config/dml/config.toml` + +### Requirement: Global user defaults +The system SHALL use global config for user defaults and bootstrap hook configuration. + +#### Scenario: Default project owner +- **WHEN** global config contains `[user].name = "alice"` and `dml init demo` omits an owner +- **THEN** the project owner is `alice` + +#### Scenario: Default branch +- **WHEN** global config contains `[defaults].branch = "main"` and `dml init demo` omits a branch +- **THEN** the initial branch is `main` + +### Requirement: Local remote config +The system SHALL store project-local config under `.dml/config.toml` containing project identity, current branch, and named remotes that map remote names to project URIs and storage roots. 
+ +#### Scenario: Resolve origin main +- **WHEN** local config defines `origin` as `dml://alice/demo` +- **THEN** `dml push origin main` resolves to project owner `alice`, project `demo`, and branch `main` + +#### Scenario: Project fields are stored +- **WHEN** local project config is written for project `alice/demo` +- **THEN** `.dml/config.toml` contains `[project].name`, `[project].owner`, `[project].uri`, and `[branch].current` + +#### Scenario: Remote fields are stored +- **WHEN** local project config records remote `origin` +- **THEN** `.dml/config.toml` contains `[remotes.origin].uri`, `[remotes.origin].bucket`, and `[remotes.origin].prefix` + +#### Scenario: Reject overlong remote URI +- **WHEN** local config would store a canonical remote project URI longer than 64 bytes +- **THEN** config validation fails + +### Requirement: Config waterfall precedence +The system SHALL resolve configurable values using explicit CLI/API arguments first, environment variables second, and config file values last. 
+ +#### Scenario: Explicit value wins over environment +- **WHEN** a command receives an explicit branch argument and `DML_BRANCH` is also set +- **THEN** the command uses the explicit branch argument + +#### Scenario: Environment wins over config +- **WHEN** a command omits a branch argument, `DML_BRANCH` is set, and config has `[branch].current` +- **THEN** the command uses `DML_BRANCH` + +#### Scenario: Config used as fallback +- **WHEN** a command omits a branch argument and `DML_BRANCH` is unset +- **THEN** the command uses the configured branch value + +#### Scenario: Remote storage env vars override config +- **WHEN** `DML_REMOTE_BUCKET` or `DML_REMOTE_PREFIX` is set for a remote operation +- **THEN** the command uses the environment value instead of the configured remote storage field + +### Requirement: Supported DML environment variables +The system SHALL support only the DML environment variables defined for the project model and SHALL treat hook context variables as output-only process context. 
+ +#### Scenario: Global config home override +- **WHEN** `DML_CONFIG_HOME` is set +- **THEN** the global DML config directory resolves from `DML_CONFIG_HOME` + +#### Scenario: Existing user env remains supported +- **WHEN** `DML_USER` is set and an owner is omitted +- **THEN** the system uses `DML_USER` as the default project owner + +#### Scenario: Existing branch env remains supported +- **WHEN** `DML_BRANCH` is set and a command omits an explicit branch +- **THEN** the system uses `DML_BRANCH` as the selected branch + +#### Scenario: Project env overrides config +- **WHEN** `DML_PROJECT_NAME`, `DML_PROJECT_OWNER`, or `DML_REMOTE_PROJECT` is set +- **THEN** the corresponding project config value is overridden for that command + +#### Scenario: Remote env overrides config +- **WHEN** `DML_REMOTE`, `DML_REMOTE_ROOT`, `DML_REMOTE_BUCKET`, or `DML_REMOTE_PREFIX` is set +- **THEN** the corresponding remote selection or storage value is overridden for that command + +#### Scenario: Hook context env is provided by DML +- **WHEN** a hook command runs +- **THEN** DML sets `DML_HOOK`, `DML_PROJECT_HOME`, and, for clone hooks, `DML_REMOTE_NAME` + +### Requirement: Project commands use project-local state and current env names only +The system SHALL resolve project-local state from the project directory and SHALL use only the current supported environment variable surface for git-like project operations. 
+ +#### Scenario: Project config comes from the project directory +- **WHEN** a project command resolves project-local config +- **THEN** it reads from `<project-dir>/.dml/config.toml` + +#### Scenario: DML_REPO is not used for project database +- **WHEN** a project command opens the local object database +- **THEN** it uses `<project-dir>/.dml/db/` and does not use `DML_REPO` + +#### Scenario: DML_REMOTE_ROOT is not used for named remotes +- **WHEN** a remote project command resolves remote storage +- **THEN** it uses named remote bucket/prefix config or `DML_REMOTE_BUCKET` and `DML_REMOTE_PREFIX`, not `DML_REMOTE_ROOT` + +#### Scenario: Removed execution/cache env vars are ignored +- **WHEN** `DML_DYNAMODB_TABLE` or `DML_REMOTE_CACHE` is set during a git-like project operation +- **THEN** the operation does not use those values + +### Requirement: Project directory initialization +The system SHALL initialize local project state under `<project-dir>/.dml/` for both `init` and `clone`. + +#### Scenario: Init creates DML directory +- **WHEN** `dml init demo` succeeds +- **THEN** the system creates `demo/.dml/`, `demo/.dml/config.toml`, and local database storage under `demo/.dml/db/` + +#### Scenario: Init refuses existing child directory +- **WHEN** `dml init demo` runs and `demo/` already exists +- **THEN** init fails and instructs the user to initialize that directory with `dml init --here demo` + +#### Scenario: Init here creates DML directory in current directory +- **WHEN** `dml init --here demo` succeeds from the current directory +- **THEN** the system creates `.dml/`, `.dml/config.toml`, and local database storage under `.dml/db/` + +#### Scenario: Init here uses provided project name +- **WHEN** `dml init --here demo` succeeds from directory `workdir` +- **THEN** the local project name is `demo` + +#### Scenario: Init creates DML gitignore +- **WHEN** `dml init demo` succeeds +- **THEN** the system writes `demo/.dml/.gitignore` containing `*` + +#### Scenario: Init creates initial branch +-
**WHEN** `dml init demo` succeeds +- **THEN** local storage contains an initial empty commit/tree and the current branch is `main` + +#### Scenario: Clone creates DML directory +- **WHEN** `dml clone dml://alice/demo` succeeds +- **THEN** the system creates a project directory with `.dml/`, `.dml/.gitignore`, `.dml/config.toml`, and local database storage under `.dml/db/` + +### Requirement: Init and clone shell hooks +The system SHALL support `post-init` and `post-clone` shell hooks from global DML config that run in the project directory after `.dml/` exists. + +#### Scenario: Init hook succeeds +- **WHEN** a `post-init` hook command is configured and `dml init demo` runs +- **THEN** the hook command runs in the `demo` project directory after `demo/.dml/` exists + +#### Scenario: Init here hook succeeds +- **WHEN** a `post-init` hook command is configured and `dml init --here demo` runs +- **THEN** the hook command runs in the current directory after `.dml/` exists + +#### Scenario: Hooks run in configured order +- **WHEN** multiple `post-init` hook commands are configured and `dml init demo` runs +- **THEN** the hook commands run in their configured list order + +#### Scenario: Clone does not run init hooks +- **WHEN** both `post-init` and `post-clone` hooks are configured and `dml clone dml://alice/demo` runs +- **THEN** only `post-clone` hook commands run + +#### Scenario: Init no-hooks skips hooks +- **WHEN** `dml init --no-hooks demo` runs +- **THEN** no `post-init` hook commands run + +#### Scenario: Clone no-hooks skips hooks +- **WHEN** `dml clone --no-hooks dml://alice/demo` runs +- **THEN** no `post-clone` hook commands run + +#### Scenario: Hook environment is provided +- **WHEN** a `post-init` or `post-clone` hook command runs +- **THEN** the process environment includes `DML_HOOK`, `DML_PROJECT_HOME`, `DML_PROJECT_NAME`, `DML_PROJECT_OWNER`, `DML_CONFIG_HOME`, and `DML_BRANCH` + +#### Scenario: Clone hook fails +- **WHEN** a `post-clone` hook command 
exits non-zero during `dml clone dml://alice/demo` +- **THEN** clone fails and reports the failing hook command + +### Requirement: DML URIs track fetched remote refs +The system SHALL track fetched remote branches and tags locally by canonical normalized DML URI. + +#### Scenario: Store fetched branch tracking ref +- **WHEN** `dml fetch dml://alice/tools#main` succeeds +- **THEN** local storage tracks `dml://alice/tools#main` as pointing to the resolved commit + +#### Scenario: Store fetched tag tracking ref +- **WHEN** `dml fetch dml://alice/tools@v1.0` succeeds +- **THEN** local storage tracks `dml://alice/tools@v1.0` as pointing to the resolved commit + +#### Scenario: Tracking ref stores commit pointer +- **WHEN** a fetched remote ref is persisted locally +- **THEN** the persisted tracking ref contains the resolved commit pointer + +#### Scenario: Canonical URI head is stored +- **WHEN** a remote fetch resolves project `alice/tools` branch `main` +- **THEN** the local tracking ref is stored under canonical URI `dml://alice/tools#main` + +#### Scenario: Derived expression is not stored as URI head +- **WHEN** a remote operation resolves a derived expression such as `HEAD~2` +- **THEN** the system stores only the canonical project branch or tag URI for any tracking head it writes + +#### Scenario: URI tracking ref length is validated +- **WHEN** a command would create a tracking ref whose canonical DML URI exceeds 64 bytes +- **THEN** the command fails without writing the tracking ref + +#### Scenario: Overlong URI is rejected directly +- **WHEN** a canonical DML URI exceeds 64 bytes +- **THEN** the system rejects it and does not hash or rewrite it into an alternate tracking key + +#### Scenario: URI tracking ref characters are validated explicitly +- **WHEN** a command would create a DML URI tracking ref +- **THEN** the system validates the canonical URI as a DML project URI before writing the tracking ref + +#### Scenario: User-facing DML URI resolves to local 
tracking ref +- **WHEN** a user-facing command receives `dml://alice/tools#main` +- **THEN** the command resolves it locally through the tracking ref for `dml://alice/tools#main` + +### Requirement: Remote operations parse DML URIs +The system SHALL parse canonical DML URIs into structured remote project ref paths for remote operations. + +#### Scenario: Push parses branch URI +- **WHEN** push targets canonical URI `dml://alice/demo#main` +- **THEN** the remote branch path is `refs/projects/alice/demo/heads/main.json` + +#### Scenario: Fetch parses tag URI +- **WHEN** fetch targets canonical URI `dml://alice/demo@v1.0` +- **THEN** the remote tag path is `refs/projects/alice/demo/tags/v1.0.json` + +#### Scenario: Raw URI is not used as remote path +- **WHEN** a remote operation handles `dml://alice/demo#main` +- **THEN** it does not use `dml://alice/demo#main` as a raw remote object path segment + +### Requirement: Project creation owner default +The system SHALL default project owner to the configured current user when project creation omits an owner. + +#### Scenario: Create project without owner +- **WHEN** the configured user is `alice` and project `demo` is created without an explicit owner +- **THEN** the project URI is `dml://alice/demo` + +### Requirement: Fetch updates remote-tracking head +The system SHALL fetch a remote project branch by reading its branch head ref, materializing the referenced commit closure locally, and updating a local remote-tracking head. 
+ +#### Scenario: Fetch origin main +- **WHEN** `dml fetch origin main` succeeds +- **THEN** local storage contains the fetched commit closure and tracks `dml://alice/demo#main` as pointing to the fetched commit + +#### Scenario: Fetch explicit project URI +- **WHEN** `dml fetch dml://alice/tools#main` succeeds +- **THEN** local storage contains the fetched commit closure and tracks `dml://alice/tools#main` as pointing to the fetched commit + +#### Scenario: Fetch explicit project tag URI +- **WHEN** `dml fetch dml://alice/tools@v1.0` succeeds +- **THEN** local storage contains the fetched commit closure and tracks `dml://alice/tools@v1.0` as pointing to the fetched commit + +### Requirement: Pull fetches and merges +The system SHALL implement branch pull as fetch followed by merge of the fetched remote-tracking head into the current branch. + +#### Scenario: Pull origin main +- **WHEN** `dml pull origin main` succeeds while the current branch is `main` +- **THEN** local tracking ref `dml://alice/demo#main` is updated and local branch `main` advances to the merge result or fetched commit when already fast-forwardable + +#### Scenario: Pull different branch fails +- **WHEN** the current branch is `feature` and the user runs `dml pull origin main` +- **THEN** pull fails without merging or advancing the current branch + +### Requirement: Push uses ETag and fast-forward safety +The system SHALL update remote branch heads only with an ETag conditional write and SHALL reject non-fast-forward pushes unless force is requested. 
+ +#### Scenario: Fast-forward push +- **WHEN** the remote branch head is an ancestor of the local branch head and the observed ETag still matches +- **THEN** push updates the remote branch head to the local commit + +#### Scenario: Non-fast-forward push rejected +- **WHEN** the remote branch head is not an ancestor of the local branch head and force is not requested +- **THEN** push fails without updating the remote branch head + +#### Scenario: Force push keeps ETag safety +- **WHEN** force is requested and the observed ETag no longer matches +- **THEN** push fails without updating the remote branch head + +#### Scenario: Push missing branch without create fails +- **WHEN** push targets a remote branch ref that does not exist and `--create` is not provided +- **THEN** push fails without creating the remote branch ref + +#### Scenario: Push missing branch with create succeeds +- **WHEN** push targets a remote branch ref that does not exist and `--create` is provided +- **THEN** push writes the remote branch ref only if it still does not exist + +#### Scenario: Create push loses race +- **WHEN** push uses `--create` and another client creates the remote branch ref first +- **THEN** push fails without overwriting the remote branch ref + +### Requirement: Clone records origin +The system SHALL clone a remote project branch by initializing local state from the remote branch and recording the remote project as `origin`. 
+#### Scenario: Clone project +- **WHEN** `dml clone dml://alice/demo` succeeds +- **THEN** local config contains an `origin` remote for `dml://alice/demo` and local branch state is initialized from the selected remote branch + +#### Scenario: Clone default branch +- **WHEN** `dml clone dml://alice/demo` omits an explicit branch commit-ish +- **THEN** clone fetches and initializes the configured default branch, falling back to `main` + +#### Scenario: Clone explicit branch +- **WHEN** `dml clone dml://alice/demo#experiment` succeeds +- **THEN** clone fetches and initializes branch `experiment` diff --git a/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/tasks.md b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/tasks.md new file mode 100644 index 0000000..91b8a32 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-add-git-like-remote-project-ops/tasks.md @@ -0,0 +1,54 @@ +## 1. Remote Project Model + +- [x] 1.1 Add remote path helpers for `refs/projects/<owner>/<project>/heads/<branch>.json` and `refs/projects/<owner>/<project>/tags/<tag>.json`. +- [x] 1.2 Extend remote ref validation to accept project owner, project, branch, and tag path segments. +- [x] 1.3 Add branch head ref read/write support with ETag capture and conditional update. +- [x] 1.4 Add immutable project tag ref write support that rejects existing tag paths. +- [x] 1.5 Support canonical DML URI tracking refs for fetched remote branches and tags. +- [x] 1.6 Add explicit DML URI head validation, including canonicalization and the 64-byte URI limit. +- [x] 1.7 Keep existing cache and DAG ref behavior compatible with the shared CAS layout. + +## 2. Local Config + +- [x] 2.1 Define `.dml/` project directory layout for `.dml/config.toml`, local object database storage under `.dml/db/`, and `.dml/.gitignore`. +- [x] 2.2 Define global config resolution for `$DML_CONFIG_HOME`, `$XDG_CONFIG_HOME/dml`, and `~/.config/dml`. 
+- [x] 2.3 Define global config storage for `[user]`, `[defaults]`, and ordered `[hooks]` `post-init` and `post-clone` command lists. +- [x] 2.4 Define local config storage for required `[project]`, `[branch]`, and `[remotes.<name>]` fields. +- [x] 2.5 Implement config load/save helpers with validation for remote names, storage fields, and `dml://<owner>/<project>` URIs. +- [x] 2.6 Implement config waterfall resolution: explicit CLI/API argument, then supported environment variable, then config file value. +- [x] 2.7 Remove git-like project operation dependencies on obsolete env vars such as `DML_REPO`, `DML_REMOTE_ROOT`, `DML_DYNAMODB_TABLE`, and `DML_REMOTE_CACHE`. +- [x] 2.8 Add project creation behavior that defaults owner and branch from global config. +- [x] 2.9 Implement `dml init <project>` to create the project directory, `.dml/`, `.dml/config.toml`, `.dml/db/`, `.dml/.gitignore`, and initial branch commit. +- [x] 2.10 Implement `dml init --here <project>` to initialize the current directory while still running hooks unless `--no-hooks` is set. +- [x] 2.11 Implement `post-init` and `post-clone` shell hook execution with hook environment variables and `--no-hooks` support. + +## 3. Branch Remote Operations + +- [x] 3.1 Implement clone to create the project directory, initialize `.dml/`, initialize local state from a remote project branch, record `origin`, and run clone hooks. +- [x] 3.2 Implement fetch to materialize a configured remote branch and update local tracking ref `dml://<owner>/<project>#<branch>`. +- [x] 3.3 Implement explicit URI fetch for `dml://<owner>/<project>[identifier]` into a local remote-tracking head. +- [x] 3.4 Implement pull as fetch plus merge into the current/local branch. +- [x] 3.5 Implement push with closure upload, DML URI parsing into structured remote project paths, fast-forward ancestry validation, and ETag conditional branch-head update for existing branches. +- [x] 3.6 Implement push `--create` to create missing remote branches only when the ref does not already exist. 
+- [x] 3.7 Implement `--force` push to bypass fast-forward validation while still requiring ETag conditional update. + +## 4. Commit and Head Operations + +- [x] 4.1 Add head-advance/update operation for moving a branch head to a resolved commit. +- [x] 4.2 Update merge flow so user-facing merge advances the current head and fast-forwards when possible. +- [x] 4.3 Add structured merge conflict reporting for DAG-name conflicts. +- [x] 4.4 Implement commit revert by applying an inverse tree diff to the current branch as a new commit with safe-application conflict checks. + +## 5. DAG Checkout + +- [x] 5.1 Implement commit-ish resolution for commit refs, local heads, remote-tracking heads, `HEAD`, and first-parent `~N` syntax. +- [x] 5.2 Implement `dag checkout` tree update from source commit/name to target name with default overwrite refusal. +- [x] 5.3 Implement `--as` and `--replace` behavior for DAG checkout. +- [x] 5.4 Ensure checkout of an absent source DAG fails without deleting local DAG names. + +## 6. CLI and Docs + +- [x] 6.1 Add CLI commands for `init`, `clone`, `fetch`, `pull`, `push`, `merge`, `revert`, and `dag checkout`. +- [x] 6.2 Update remote, commit, DAG, CLI, and config documentation for the new behavior. +- [x] 6.3 Add tests for remote project ref paths, config resolution, fetch/pull/push safety, merge conflicts, revert, and DAG checkout. +- [x] 6.4 Run the relevant test suite and update any fixtures affected by remote layout changes. 
diff --git a/openspec/changes/archive/2026-04-26-clean-up-configuration/.openspec.yaml b/openspec/changes/archive/2026-04-26-clean-up-configuration/.openspec.yaml new file mode 100644 index 0000000..3f1f00e --- /dev/null +++ b/openspec/changes/archive/2026-04-26-clean-up-configuration/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-26 diff --git a/openspec/changes/archive/2026-04-26-clean-up-configuration/design.md b/openspec/changes/archive/2026-04-26-clean-up-configuration/design.md new file mode 100644 index 0000000..3d9fe4e --- /dev/null +++ b/openspec/changes/archive/2026-04-26-clean-up-configuration/design.md @@ -0,0 +1,75 @@ +## Context + +Configuration concerns are currently concentrated in `src/daggerml/_config.py`, which mixes runtime resolution, global config loading, project config persistence, project-layout helpers, and hook execution. At the same time, `api.py` and the CLI both sit on top of `_internal`, but they do not yet read as thin bindings over one shared configuration model. + +The cleanup needs to make `_internal` the clear package boundary for configuration and operations without weakening existing contracts such as project-local `.dml/config.toml` and explicit required remote configuration for remote-backed flows. It also needs to document that frontend parity is the goal for shared config/ops behavior even though some behaviors remain API-only because CLI serialization cannot represent them cleanly. + +## Goals / Non-Goals + +**Goals:** +- Define one canonical internal config model and one shared resolution path used by both API and CLI. +- Keep one resolver implementation while supporting two scopes: `project/runtime` and `global`. +- Preserve the distinction between different config sources while making them inputs to the same resolved internal config. +- Reduce the canonical config surface to a small set of URI- and path-based fields with helper accessors for parsed components. 
+- Make derived values, especially remote configuration handed to remote-aware ops, flow from resolved config objects rather than ad hoc env/file reads in frontends. +- Update docs and tests so configuration behavior is described consistently across API and CLI frontends. +- Explicitly document which functionality is intentionally missing from the CLI because serialization constraints prevent a practical command-line surface. + +**Non-Goals:** +- Redesigning remote project refs, hook semantics, or storage layout. +- Adding new end-user configuration features beyond the cleanup needed to unify existing behavior. +- Introducing compatibility aliases for old or experimental env-var names that are outside the current documented contract. +- Eliminating API-only functionality that depends on Python object or function serialization the CLI cannot represent. + +## Decisions + +### Use one canonical internal config model with multiple source adapters +The implementation will define one canonical resolved config model owned by `_internal`. Explicit arguments, environment variables, project-local config, and global config remain distinct sources, but they must all feed the same internal resolution path consumed by both API and CLI frontends. + +Alternative considered: keep separate runtime and project-command resolved models as the long-term boundary. +Why not: it conflicts with the intended architecture where `_internal` is the package and `api.py` and CLI are frontends over the same underlying behavior. + +### Keep source-specific loading but centralize normalization in `_internal` +Project-local config, global config, and environment-variable handling can still be loaded through source-specific helpers, but normalization, precedence, validation, and derivation belong in `_internal` shared code. Frontends should not embed their own config semantics. + +Alternative considered: let each frontend adapt raw sources differently and only converge at ops calls. 
+Why not: it preserves duplicated behavior and makes API/CLI parity hard to test. + +### Normalize remote configuration before remote-aware components are constructed +Remote-aware components should receive already-resolved `remote.uri` values from the shared internal resolver rather than inspecting raw environment variables or project config files directly. The cleanup removes overlapping remote config forms and keeps parsed remote bucket/prefix details as helpers on the resolved config object instead of separate canonical parameters. + +Alternative considered: keep `remote.root`, `remote.uri`, and bucket/prefix config side by side. +Why not: multiple canonical representations for the same remote location are a major source of current config complexity. + +### Normalize project identity into `remote.project` +The resolved config model will use `remote.project` as the canonical project identity, and the resolver will normalize that URI to always include a branch, defaulting from `default_branch` when needed. `remote.project` will never normalize to a tag form because tags are immutable and are not valid active project context. Code that needs the branch will use a `project.branch` helper on the resolved config object instead of a standalone canonical `branch` parameter. + +Alternative considered: keep a separate canonical `branch` config parameter. +Why not: it duplicates information already carried by the normalized project URI and creates another overlap point between API and CLI. + +### Keep `db.path` as an explicit overridable field with a dynamic default +The resolved config model will include `db.path`, with the same resolution order as other `project/runtime` fields, but its default will be computed dynamically as `project.home/.dml/db/`. This keeps thin runtimes workable because they can set `DML_DB_PATH` directly without requiring richer project config. 
+ +Alternative considered: derive the DB path only implicitly from `project.home` and never expose it as config. +Why not: some thin runtimes need to point directly at a DB path without carrying the rest of the project setup. + +### Treat API and CLI as parity frontends with documented CLI gaps +The implementation should aim for API and CLI to use the same config and ops machinery under the hood. Where CLI cannot expose a feature because command-line serialization cannot faithfully represent Python-level inputs or outputs, the limitation should be documented explicitly rather than modeled as a config difference. + +Alternative considered: make CLI omissions implicit and leave parity undefined. +Why not: it hides real product constraints and makes it unclear whether a behavior difference is intentional or a bug. + +## Risks / Trade-offs + +- [Cross-cutting caller updates] -> Update API, CLI, and shared internal config helpers in the same change and verify them together. +- [Behavior drift between docs and code] -> Treat the OpenSpec artifacts and `docs/configuration.md` as part of the same cleanup so the authoritative contract stays aligned. +- [Frontend parity assumptions become too strong] -> Document serialization-driven CLI gaps explicitly so parity expectations are accurate. +- [URI normalization edge cases] -> Add tests for project URIs with and without explicit branches and for canonical remote URI parsing helpers. + +## Migration Plan + +No persisted data migration is required. The change should land as one coordinated refactor that updates shared internal configuration helpers, their frontend callers, and documentation/tests together. + +## Open Questions + +- Whether the cleanup is best expressed as one reorganized `_config.py` module or as a small split into dedicated internal configuration modules can be left to implementation, as long as the scoped resolver and canonical config contract in the specs remain the same. 
diff --git a/openspec/changes/archive/2026-04-26-clean-up-configuration/proposal.md b/openspec/changes/archive/2026-04-26-clean-up-configuration/proposal.md new file mode 100644 index 0000000..72efee0 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-clean-up-configuration/proposal.md @@ -0,0 +1,28 @@ +## Why + +Configuration is currently split across runtime config, global config, and project config helpers that live in one module but follow different shapes, precedence rules, and environment-variable conventions. That makes the configuration surface hard to reason about, increases duplication between the API and CLI frontends, and leaves mismatches between the documented contract and the behavior the code actually exposes. + +## What Changes + +- Define one canonical internal configuration contract owned by `_internal` and used by both `api.py` and the CLI. +- Normalize explicit arguments, environment variables, project-local config, and global config through one shared resolver with `project/runtime` and `global` scopes. +- Reduce overlapping config names to a smaller canonical set: `project.home`, `remote.project`, `db.path`, `remote.uri`, `user`, `default_branch`, and `hooks.post-{init,clone}`. +- Make `remote.project` canonical for project identity and branch context by normalizing it to always include a branch, never a tag, with `project.branch` exposed as a helper rather than a standalone config parameter. +- Default `db.path` dynamically from `project.home/.dml/db/` so thin runtimes can operate by setting env vars directly. +- Clarify which config values are canonical, where derived values come from, and which helpers are responsible for validation and precedence. +- Update API and CLI call sites to use the same shared config resolution path instead of frontend-specific translation. +- Document that some API-backed behaviors remain unavailable in the CLI where object/function serialization prevents a practical CLI surface. 
+ +## Capabilities + +### New Capabilities +- `shared-internal-configuration`: Define the canonical internal configuration model and the shared resolution path used by both API and CLI frontends. + +### Modified Capabilities +- `required-remote-config`: Clarify how required remote configuration is normalized and handed to remote-aware components through the shared internal config path. + +## Impact + +- Affected code: `src/daggerml/_config.py` or successor internal config modules, config consumers in `src/daggerml/api.py`, CLI entry points under `src/daggerml/_cli/`, and related tests/docs. +- Affected APIs: configuration dataclasses/helpers, env-var resolution, URI parsing helpers, and shared API/CLI config loading behavior. +- Affected systems: frontend bootstrap, remote configuration handling, and configuration-focused tests/docs. diff --git a/openspec/changes/archive/2026-04-26-clean-up-configuration/specs/required-remote-config/spec.md b/openspec/changes/archive/2026-04-26-clean-up-configuration/specs/required-remote-config/spec.md new file mode 100644 index 0000000..11da127 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-clean-up-configuration/specs/required-remote-config/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: Remote-aware components require explicit remote configuration +The system SHALL require explicit remote configuration at the constructor or helper boundary for any runtime or ops component that performs remote-backed behavior. Remote-aware interfaces MUST NOT model remote configuration as optional, MUST NOT provide `None` defaults for required remote parameters, and MUST receive normalized `remote.uri` configuration from the shared internal configuration resolver rather than reading raw environment variables or project config files themselves. 
+ +#### Scenario: Remote-aware ops constructor requires remote URI +- **WHEN** a remote-aware ops type is defined +- **THEN** its constructor signature requires a concrete normalized remote URI argument rather than an optional remote parameter + +#### Scenario: Remote-aware runtime helper requires remote configuration +- **WHEN** a runtime helper delegates to remote-backed behavior +- **THEN** it passes explicit remote configuration to the remote-aware component it constructs + +#### Scenario: Remote-aware component does not resolve env vars directly +- **WHEN** a remote-aware runtime or ops component is used in a remote-backed flow +- **THEN** it receives already-resolved remote configuration from its caller instead of inspecting `DML_REMOTE`, older remote env-var forms, or project config files directly diff --git a/openspec/changes/archive/2026-04-26-clean-up-configuration/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-04-26-clean-up-configuration/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..7a0074a --- /dev/null +++ b/openspec/changes/archive/2026-04-26-clean-up-configuration/specs/shared-internal-configuration/spec.md @@ -0,0 +1,82 @@ +## ADDED Requirements + +### Requirement: API and CLI use one shared internal configuration model +The system SHALL resolve configuration through one canonical internal configuration model owned by `_internal`. Both `daggerml.api` and the CLI SHALL use that shared internal resolver rather than maintaining frontend-specific configuration semantics. 
+ +#### Scenario: API and CLI share resolution behavior +- **WHEN** API code and CLI code resolve the same explicit values, environment variables, and config-file inputs +- **THEN** they produce the same resolved internal configuration for the underlying operation + +#### Scenario: Frontends remain thin bindings +- **WHEN** a frontend prepares to invoke shared internal operations +- **THEN** it delegates configuration precedence, validation, and derivation to shared internal configuration code instead of re-implementing those rules locally + +### Requirement: One resolver supports `project/runtime` and `global` scopes +The system SHALL expose one shared internal resolver that supports `project/runtime` and `global` scopes. Both scopes MUST use the same precedence model, but they load different config-file layers according to scope. + +#### Scenario: Project scope loads project and global config layers +- **WHEN** configuration is resolved in `project/runtime` scope +- **THEN** the resolver applies `explicit > environment variables > project config > global config > defaults` + +#### Scenario: Global scope omits project config +- **WHEN** configuration is resolved in `global` scope +- **THEN** the resolver applies `explicit > environment variables > global config > defaults` without requiring a project config file + +### Requirement: Canonical config parameters are reduced to one normalized set +The system SHALL normalize supported configuration inputs into the canonical internal parameters `project.home`, `remote.project`, `db.path`, `remote.uri`, `user`, `default_branch`, `hooks.post-init`, `hooks.post-clone`, and `config_home`. 
+ +#### Scenario: Legacy overlapping branch parameter is not canonical +- **WHEN** project configuration is resolved +- **THEN** branch context is carried by normalized `remote.project` rather than by a separate canonical `branch` parameter + +#### Scenario: Legacy overlapping remote parameters are not canonical +- **WHEN** remote-backed configuration is resolved +- **THEN** the canonical remote parameter is `remote.uri` rather than separate `remote.root`, `remote.bucket`, or `remote.prefix` parameters + +### Requirement: Multiple config sources normalize into the shared internal model +The system SHALL treat explicit arguments, environment variables, project-local config, and global config as sources that feed the shared internal configuration model. Source-specific loading may differ, but normalization and precedence MUST be centralized in the shared internal resolver. + +#### Scenario: Project-local and global config feed shared resolution +- **WHEN** a frontend resolves configuration for an operation in a project directory +- **THEN** project-local `.dml/config.toml` and any applicable global config inputs are loaded as sources for the same shared internal resolution path + +#### Scenario: Environment values are normalized centrally +- **WHEN** configuration is resolved from environment variables +- **THEN** the shared internal resolver, not the frontend, maps those values into the canonical internal configuration model + +### Requirement: Project URI is normalized and exposes helper accessors +The system SHALL normalize `remote.project` so that resolved project configuration always includes a branch and never a tag. The resolved config object SHALL expose a `project.branch` helper derived from the normalized URI. 
+#### Scenario: Missing branch normalizes from default branch +- **WHEN** `remote.project` is provided without a branch in `project/runtime` scope +- **THEN** the resolver appends the effective default branch to the normalized `remote.project` + +#### Scenario: Tag URI is rejected for project context +- **WHEN** `remote.project` is provided with a tag selector +- **THEN** project configuration resolution fails because active project context must target a branch, not an immutable tag + +#### Scenario: Project branch helper is derived from normalized URI +- **WHEN** resolved configuration includes `remote.project` +- **THEN** `project.branch` returns the branch encoded in the normalized URI rather than reading a standalone branch config parameter + +### Requirement: DB path can be overridden but defaults from project home +The system SHALL resolve `db.path` with the same precedence as other `project/runtime` parameters, and when no higher-precedence value is provided it SHALL default to `<project.home>/.dml/db/`. + +#### Scenario: Explicit DB path overrides dynamic default +- **WHEN** `db.path` is provided explicitly or through `DML_DB_PATH` +- **THEN** the resolved config uses that DB path instead of deriving it from `project.home` + +#### Scenario: DB path defaults from project home +- **WHEN** `db.path` is not provided and resolved config includes `project.home` +- **THEN** `db.path` resolves to `<project.home>/.dml/db/` + +### Requirement: CLI limitations caused by serialization are documented, not treated as config divergence +The system SHALL document operations that remain unavailable in the CLI because command-line serialization cannot faithfully represent the required Python-level inputs or outputs. These omissions MUST NOT create a separate CLI-specific configuration model. 
+ +#### Scenario: Serialization-limited API behavior stays API-only +- **WHEN** an operation such as `start_fn` depends on Python object or function serialization that the CLI cannot represent cleanly +- **THEN** the documentation identifies that operation as unavailable in the CLI while preserving the shared internal configuration model for supported operations + +#### Scenario: Missing CLI feature does not imply different config rules +- **WHEN** a capability is supported by both API and CLI +- **THEN** both frontends use the same shared internal configuration rules for that capability diff --git a/openspec/changes/archive/2026-04-26-clean-up-configuration/tasks.md b/openspec/changes/archive/2026-04-26-clean-up-configuration/tasks.md new file mode 100644 index 0000000..791e6b0 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-clean-up-configuration/tasks.md @@ -0,0 +1,19 @@ +## 1. Define Shared Internal Configuration + +- [x] 1.1 Refactor configuration code so `_internal` owns one canonical configuration model and one scope-aware resolver used by both API and CLI. +- [x] 1.2 Implement the canonical config parameters `project.home`, `remote.project`, `db.path`, `remote.uri`, `user`, `default_branch`, `hooks.post-init`, `hooks.post-clone`, and `config_home`, removing overlapping canonical params such as `branch`, `remote.root`, and named-remote config fields. +- [x] 1.3 Keep explicit args, environment variables, project-local config, and global config as sources that normalize through the shared precedence rules for `project/runtime` and `global` scopes. +- [x] 1.4 Normalize `remote.project` to always include a branch, reject tag-form project URIs, expose `project.branch` as a helper, and default `db.path` from `project.home/.dml/db/` when unset. +- [x] 1.5 Keep remote configuration normalization in shared internal code so remote-aware ops receive resolved `remote.uri` values rather than reading raw env or config files directly. + +## 2. 
Update API And CLI Frontends + +- [x] 2.1 Update `daggerml.api` to consume the shared internal configuration resolver instead of reconstructing config from frontend state. +- [x] 2.2 Update CLI entry points to consume the same shared internal configuration resolver for supported operations. +- [x] 2.3 Remove duplicated or ad hoc frontend-specific configuration translation paths that bypass shared internal config behavior, including old `DML_REPO` and `DML_BRANCH` assumptions. + +## 3. Verify And Document Frontend Parity + +- [x] 3.1 Update configuration tests to cover shared API/CLI resolution behavior, scope-aware precedence, project URI normalization, `project.branch` helper behavior, `db.path` dynamic defaulting, and remote-config handoff behavior. +- [x] 3.2 Update `docs/configuration.md` and related docs to describe `_internal` as the shared config boundary for API and CLI, document the canonical config table, and document intentional CLI gaps caused by serialization limits. +- [x] 3.3 Run the relevant test coverage for config, API, and CLI paths affected by the cleanup. diff --git a/openspec/changes/archive/2026-04-26-require-remote-config/.openspec.yaml b/openspec/changes/archive/2026-04-26-require-remote-config/.openspec.yaml new file mode 100644 index 0000000..3f1f00e --- /dev/null +++ b/openspec/changes/archive/2026-04-26-require-remote-config/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-26 diff --git a/openspec/changes/archive/2026-04-26-require-remote-config/design.md b/openspec/changes/archive/2026-04-26-require-remote-config/design.md new file mode 100644 index 0000000..4a10fc2 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-require-remote-config/design.md @@ -0,0 +1,44 @@ +## Context + +Remote-backed runtime behavior already depends on a real remote root, but some constructors and helpers still expose remote configuration as optional in their signatures and defaults. 
That creates a split between the actual runtime contract and the type-level/API contract, which in turn encourages invalid test fixtures, dead fallback branches, and avoidable optional handling in remote-aware code. + +This change is cross-cutting because the contract is expressed in multiple places: remote-aware ops classes, execution helpers, runtime/config call sites, and test adapter scripts that sometimes use a remote-aware type only to borrow a transaction wrapper. + +## Goals / Non-Goals + +**Goals:** +- Make remote configuration required in every remote-aware constructor and helper signature. +- Remove optional typing and `None` defaults for remote-root and equivalent remote config parameters. +- Align tests and helper code with the required-remote contract. +- Keep purely local setup code on local-only primitives instead of remote-aware ops. + +**Non-Goals:** +- Changing remote protocol behavior or storage layout. +- Adding new runtime validation beyond the existing remote-root parsing/usage logic. +- Redesigning non-remote APIs that do not depend on remote state. + +## Decisions + +### Require remote config at the signature boundary +Remote-aware components will require explicit remote configuration in their public constructors and helpers rather than modeling it as optional. This matches the real operational contract and lets type checking enforce the same rule the runtime already depends on. + +Alternative considered: leave optional signatures and rely on runtime errors. +Why not: it preserves misleading APIs and keeps unnecessary optional branches throughout the codebase. + +### Fix local-only helpers by using local-only primitives +Tests or scripts that only need transaction access will switch from remote-aware ops classes to `BaseOps` or other local-only primitives. This keeps the required-remote contract intact without inventing fake remote defaults for code paths that are not actually remote-backed. 
+ +Alternative considered: pass placeholder remote roots everywhere. +Why not: it hides the distinction between remote-aware and local-only flows and weakens the contract we are trying to make explicit. + +### Remove optional remote typing without adding compatibility shims +The implementation will directly remove `Optional`, `| None`, and `None` defaults for remote configuration fields and parameters. Callers must be updated in the same change. + +Alternative considered: temporary overloads or compatibility wrappers. +Why not: they prolong an unsupported contract and add cleanup work for no product value. + +## Risks / Trade-offs + +- [Call sites missed during the sweep] -> Use type checking and the full test suite to catch remaining bare constructors or optional remote call paths. +- [Tests depended on borrowing remote-aware ops for local setup] -> Move those helpers to `BaseOps` or equivalent local-only primitives. +- [Broader API tightening may surface more compile-time churn] -> Keep the change scoped to remote-aware surfaces and update all in-repo callers together. diff --git a/openspec/changes/archive/2026-04-26-require-remote-config/proposal.md b/openspec/changes/archive/2026-04-26-require-remote-config/proposal.md new file mode 100644 index 0000000..376140b --- /dev/null +++ b/openspec/changes/archive/2026-04-26-require-remote-config/proposal.md @@ -0,0 +1,25 @@ +## Why + +Remote-backed operations now assume a configured remote root, but parts of the codebase and tests still model remote arguments and config as optional. That mismatch obscures the current contract, complicates types, and leaves dead fallback paths in APIs that should require explicit remote configuration. + +## What Changes + +- **BREAKING** Make remote configuration required in runtime and ops surfaces that depend on remote-backed behavior. +- Remove `Optional`, `| None`, and `None` defaults from remote-root and remote-config parameters used by remote-aware components. 
+- Update constructors, helpers, and tests to always pass explicit remote configuration where remote-backed behavior is exercised. +- Remove unsupported code paths that model missing remote configuration for components that always require it. +- Keep non-remote code paths using non-remote primitives where only local transaction access is needed. + +## Capabilities + +### New Capabilities +- `required-remote-config`: Define the contract that remote-aware runtime and ops components require explicit remote configuration rather than optional remote arguments. + +### Modified Capabilities +- `execution-state`: Tighten the existing remote-root requirement so remote-backed execution helpers are modeled consistently as always requiring a valid remote root. + +## Impact + +- Affected code: `src/daggerml/_internal/ops/*`, `src/daggerml/_internal/exec_state.py`, runtime/config call sites, and tests/helpers that construct remote-aware ops. +- Affected APIs: constructors and helpers for remote-aware components now require explicit remote config values. +- Affected systems: remote execution, remote cache operations, and any adapter/test helper that bootstraps remote-aware ops. diff --git a/openspec/changes/archive/2026-04-26-require-remote-config/specs/execution-state/spec.md b/openspec/changes/archive/2026-04-26-require-remote-config/specs/execution-state/spec.md new file mode 100644 index 0000000..62409c4 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-require-remote-config/specs/execution-state/spec.md @@ -0,0 +1,12 @@ +## MODIFIED Requirements + +### Requirement: ExecutionState constructed from remote_root +The system SHALL accept `remote_root: str` as a required configuration parameter for `ExecutionState`. Call sites that construct `ExecutionState` MUST provide a valid remote root explicitly and MUST NOT rely on optional remote-root values or `None` defaults. 
+ +#### Scenario: remote_root parsed to bucket and prefix +- **WHEN** `ExecutionState(cache_key, remote_root="s3://my-bucket/my/prefix")` is constructed +- **THEN** lock operations target `s3://my-bucket/my/prefix/exec/{cache_key}.json` + +#### Scenario: call site provides explicit remote_root +- **WHEN** code constructs `ExecutionState` for a remote-backed execution flow +- **THEN** that call site passes a concrete `remote_root: str` value at construction time diff --git a/openspec/changes/archive/2026-04-26-require-remote-config/specs/required-remote-config/spec.md b/openspec/changes/archive/2026-04-26-require-remote-config/specs/required-remote-config/spec.md new file mode 100644 index 0000000..cf04853 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-require-remote-config/specs/required-remote-config/spec.md @@ -0,0 +1,19 @@ +## ADDED Requirements + +### Requirement: Remote-aware components require explicit remote configuration +The system SHALL require explicit remote configuration at the constructor or helper boundary for any runtime or ops component that performs remote-backed behavior. Remote-aware interfaces MUST NOT model remote configuration as optional and MUST NOT provide `None` defaults for required remote parameters. + +#### Scenario: Remote-aware ops constructor requires remote root +- **WHEN** a remote-aware ops type is defined +- **THEN** its constructor signature requires a concrete remote root argument rather than an optional remote-root value + +#### Scenario: Remote-aware runtime helper requires remote configuration +- **WHEN** a runtime helper delegates to remote-backed behavior +- **THEN** it passes explicit remote configuration to the remote-aware component it constructs + +### Requirement: Local-only setup uses local-only primitives +The system SHALL use local-only primitives for code paths that only need local transaction or repository setup behavior and do not perform remote-backed operations. 
+ +#### Scenario: Local setup helper avoids remote-aware constructor +- **WHEN** a helper only creates local commits, heads, trees, or transactions +- **THEN** it uses a local-only primitive instead of constructing a remote-aware ops type without remote configuration diff --git a/openspec/changes/archive/2026-04-26-require-remote-config/tasks.md b/openspec/changes/archive/2026-04-26-require-remote-config/tasks.md new file mode 100644 index 0000000..47272d1 --- /dev/null +++ b/openspec/changes/archive/2026-04-26-require-remote-config/tasks.md @@ -0,0 +1,17 @@ +## 1. Tighten Remote-Aware Signatures + +- [x] 1.1 Identify remote-aware runtime and ops constructors/helpers that still model remote config as optional. +- [x] 1.2 Remove `Optional`, `| None`, and `None` defaults from required remote-root and remote-config parameters. +- [x] 1.3 Keep local-only code paths on local-only primitives instead of weakening remote-aware interfaces. + +## 2. Update Call Sites + +- [x] 2.1 Update in-repo runtime and ops call sites to pass explicit remote configuration wherever remote-backed behavior is used. +- [x] 2.2 Update test fixtures and helper scripts that currently construct remote-aware ops without remote config. +- [x] 2.3 Replace local setup uses of remote-aware ops with `BaseOps` or equivalent local-only primitives where no remote behavior is needed. + +## 3. Verify Contract + +- [x] 3.1 Update or add tests that assert remote-aware interfaces are constructed with explicit remote config. +- [x] 3.2 Run pyright and the relevant pytest coverage for remote-aware runtime and ops surfaces. +- [x] 3.3 Confirm no unsupported optional remote-config paths remain in the implementation. 
diff --git a/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/.openspec.yaml b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/.openspec.yaml new file mode 100644 index 0000000..1b4051e --- /dev/null +++ b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-27 diff --git a/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/design.md b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/design.md new file mode 100644 index 0000000..3306928 --- /dev/null +++ b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/design.md @@ -0,0 +1,59 @@ +## Context + +Current git-like project commands are implemented partly in `src/daggerml/_cli/project.py` and partly in internal ops classes. This splits orchestration concerns across layers, duplicates repository/remote wiring logic, and makes behavior-level tests depend on CLI internals instead of stable internal interfaces. + +The target architecture in repository docs is a thin CLI surface over internal operations (`DmlOps` facade + specialized ops modules). This change applies that layering rule consistently to git-like project commands. + +## Goals / Non-Goals + +**Goals:** +- Keep `src/daggerml/_cli/` handlers focused on parsing/validation and single-call delegation. +- Move git-like project operation orchestration into `DmlOps` methods that coordinate existing `CommitOps` and `RemoteOps` behavior. +- Preserve current command semantics and JSON error behavior while changing ownership boundaries. +- Make tests assert behavior through `DmlOps` boundaries instead of CLI-private helper functions. + +**Non-Goals:** +- Redefine merge/revert/fetch protocol semantics already owned by `CommitOps`/`RemoteOps`. +- Introduce new user-facing command flags or alter command output shapes. +- Redesign remote URI formats or project config schema. 
+ +## Decisions + +### 1) Add git-like project entrypoints on `DmlOps` +`DmlOps` will expose explicit methods for project-level workflows (`fetch`, `pull`, `push`, `checkout`, `merge`, `revert`, and clone composition support). These methods will own cross-subsystem coordination and shared helper logic currently in CLI utilities. + +Alternative considered: move logic directly into `CommitOps`/`RemoteOps` only. Rejected because these workflows span both subsystems plus project config concerns; `DmlOps` is the existing facade intended for orchestration. + +### 2) Keep CLI handlers as strict adapters +`src/daggerml/_cli/project.py` command functions remain responsible for argparse-facing input parsing only, then call one `DmlOps` method and return serialized results. + +Alternative considered: leave mixed helper functions in CLI for practicality. Rejected because it weakens the thin-CLI contract and increases duplication risk. + +### 3) Preserve existing command contracts while moving ownership +The refactor will keep observable command behavior stable (success payload fields, detached/attached checkout reporting, and failure patterns). Any behavior changes must be explicitly captured in specs and tests. + +Alternative considered: take opportunity to simplify command outputs. Rejected for this change to minimize migration risk. + +## Risks / Trade-offs + +- [Risk] Refactor unintentionally changes edge-case behavior for revision resolution or checkout mode detection. + → Mitigation: add parity tests for key scenarios (branch checkout, detached checkout, unresolved revision, clone branch/tag). + +- [Risk] `DmlOps` grows too broad if orchestration methods become overly large. + → Mitigation: keep `DmlOps` methods thin coordinators that call focused private helpers or existing ops methods. + +- [Risk] CLI tests may become brittle during transition. + → Mitigation: update tests to assert delegation boundaries and user-visible outputs, not helper implementation details. 
+ +## Migration Plan + +1. Add new `DmlOps` project-operation methods and supporting internal helpers. +2. Update CLI project handlers to call those methods directly. +3. Port/add tests to validate both delegation and user-visible parity. +4. Run targeted CLI and internal ops test suites; fix regressions before merge. + +Rollback: revert the `DmlOps` delegation changes and restore prior CLI helper ownership if parity breaks cannot be resolved in-scope. + +## Open Questions + +- Whether clone-specific setup concerns (filesystem/project-layout steps) should eventually be split into a dedicated internal project service versus remaining as a `DmlOps` method. diff --git a/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/proposal.md b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/proposal.md new file mode 100644 index 0000000..679a308 --- /dev/null +++ b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/proposal.md @@ -0,0 +1,24 @@ +## Why + +Git-like project workflows are currently split between CLI handlers and internal ops, which makes command behavior harder to maintain and test consistently. We need a single internal operation entrypoint for these workflows so the CLI remains a thin parser/dispatcher layer. + +## What Changes + +- Move git-like project command orchestration (`fetch`, `pull`, `push`, `checkout`, `merge`, `revert`, and `clone` composition logic) behind `DmlOps` methods. +- Reduce `src/daggerml/_cli/` command handlers to argument parsing, basic input normalization, and calling one `DmlOps` method per command. +- Keep user-visible command semantics and error surfaces aligned with current git-like behavior while relocating implementation ownership. +- Add or update tests to enforce thin CLI routing and `DmlOps` ownership for git-like project operations. 
+ +## Capabilities + +### New Capabilities +- `thin-cli-routing`: Define a requirement that git-like project command handlers in `_cli` are thin wrappers that delegate operational behavior to `DmlOps`. + +### Modified Capabilities +- `git-like-commit-ops`: Clarify that git-like project operations are executed by internal ops-owned methods surfaced via `DmlOps`, with CLI acting as a transport layer. + +## Impact + +- Affected code: `src/daggerml/_cli/project.py`, `src/daggerml/_cli/base.py`, and `src/daggerml/_internal/ops/__init__.py` (plus any extracted internal ops helpers). +- Affected tests: CLI project command tests and internal ops tests around git-like flows. +- API impact: no new end-user CLI flags required; behavior remains compatible while ownership shifts to internal ops. diff --git a/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..ac7e237 --- /dev/null +++ b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/specs/git-like-commit-ops/spec.md @@ -0,0 +1,16 @@ +## ADDED Requirements + +### Requirement: Git-like project workflows are owned by DmlOps orchestration +Git-like project command workflows SHALL execute through `DmlOps` orchestration methods that coordinate commit and remote operations without requiring CLI-owned business logic. 
+ +#### Scenario: Pull executes through DmlOps workflow +- **WHEN** a caller invokes project pull with remote target, head ref, and user context +- **THEN** `DmlOps` resolves project context, performs remote synchronization, and applies merge behavior through internal ops + +#### Scenario: Push executes through DmlOps workflow +- **WHEN** a caller invokes project push with remote target and push options +- **THEN** `DmlOps` performs project-aware remote push behavior and returns the push result without CLI-managed remote orchestration + +#### Scenario: Revert executes through DmlOps workflow +- **WHEN** a caller invokes project revert with revision, head ref, and user context +- **THEN** `DmlOps` resolves the revision and performs revert behavior through internal commit operations diff --git a/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/specs/thin-cli-routing/spec.md b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/specs/thin-cli-routing/spec.md new file mode 100644 index 0000000..ce61b00 --- /dev/null +++ b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/specs/thin-cli-routing/spec.md @@ -0,0 +1,34 @@ +## ADDED Requirements + +### Requirement: CLI project commands delegate to a single DmlOps method +The `dml` CLI project command handlers SHALL remain thin adapters that parse command arguments and invoke exactly one `DmlOps` method per command path. 
+ +#### Scenario: Fetch delegates through DmlOps +- **WHEN** a user runs `dml fetch [branch]` +- **THEN** the CLI handler parses inputs and calls one `DmlOps` fetch workflow method that performs remote synchronization behavior + +#### Scenario: Checkout delegates through DmlOps +- **WHEN** a user runs `dml checkout <revision>` +- **THEN** the CLI handler parses the revision and calls one `DmlOps` checkout workflow method that returns attached/detached result details + +#### Scenario: Merge delegates through DmlOps +- **WHEN** a user runs `dml merge <revision> --head <head> --user <user>` +- **THEN** the CLI handler calls one `DmlOps` merge workflow method and does not instantiate commit/remote ops directly + +### Requirement: CLI does not own git-like project business logic +The `_cli` layer SHALL NOT contain git-like project orchestration logic that coordinates repository state, commit resolution, or remote protocol execution. + +#### Scenario: Project logic relocation +- **WHEN** git-like project command behavior requires cross-subsystem coordination +- **THEN** the implementation resides in `DmlOps` (and internal ops it invokes), while CLI code remains argument parsing and result forwarding only + +### Requirement: Clone command composes via DmlOps workflow +The clone CLI entrypoint SHALL delegate clone workflow composition to a single `DmlOps` method after input parsing and command-level validation.
+ +#### Scenario: Clone branch flow delegation +- **WHEN** a user runs `dml clone dml://alice/demo#main --bucket my-bucket` +- **THEN** the CLI entrypoint delegates to one `DmlOps` clone workflow method that performs fetch and checkout composition and returns clone result metadata + +#### Scenario: Clone tag flow delegation +- **WHEN** a user runs `dml clone dml://alice/demo@v1.0 --bucket my-bucket` +- **THEN** the CLI entrypoint delegates to one `DmlOps` clone workflow method that performs fetch and detached checkout semantics through internal ops diff --git a/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/tasks.md b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/tasks.md new file mode 100644 index 0000000..a5b946b --- /dev/null +++ b/openspec/changes/archive/2026-04-27-thin-cli-git-ops-routing/tasks.md @@ -0,0 +1,17 @@ +## 1. DmlOps workflow surface + +- [x] 1.1 Add git-like project workflow methods to `DmlOps` for `fetch`, `pull`, `push`, `checkout`, `merge`, and `revert` orchestration. +- [x] 1.2 Add clone workflow support in `DmlOps` that composes fetch + checkout while preserving current branch/tag semantics. +- [x] 1.3 Move or rehome CLI-local helper logic (project config loading, remote URI mapping, revision resolution helpers) into internal ops-owned code paths used by `DmlOps`. + +## 2. Thin CLI refactor + +- [x] 2.1 Update `src/daggerml/_cli/project.py` handlers so each command parses args and calls a single `DmlOps` method. +- [x] 2.2 Remove direct `CommitOps`/`RemoteOps` instantiation from `_cli` project handlers. +- [x] 2.3 Ensure `src/daggerml/_cli/base.py` command dispatch continues routing git-like operations via `DmlOps` without embedding business logic. + +## 3. Verification and regression coverage + +- [x] 3.1 Add/update CLI tests to assert git-like project commands remain thin delegates and preserve output/error contracts. 
+- [x] 3.2 Add/update internal ops tests for new `DmlOps` workflow methods, including checkout mode and merge/revert/pull/push orchestration paths. +- [x] 3.3 Run targeted test suites for CLI project commands and internal ops workflows; fix parity regressions before completion. diff --git a/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/.openspec.yaml b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/.openspec.yaml new file mode 100644 index 0000000..12e66c2 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-30 diff --git a/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/design.md b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/design.md new file mode 100644 index 0000000..e5dc1b1 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/design.md @@ -0,0 +1,54 @@ +## Context + +`DmlOps.init` currently performs several project-layout responsibilities inline: creating `.dml/`, writing `.dml/.gitignore`, and writing `.dml/config.toml` (when absent). A shared helper, `init_project_layout(project_dir, cfg)`, already exists in internal config code and performs this bootstrap work. + +The current duplication increases the chance that layout or config-writing behavior diverges between call sites. This is especially risky in init recovery paths where config and db presence are intentionally checked and acted on separately. + +## Goals / Non-Goals + +**Goals:** +- Route `DmlOps.init` layout/bootstrap creation through `init_project_layout`. +- Preserve existing init contract: arguments, returned keys/values, and error behavior. +- Keep recovery behavior intact (including when pull is required). +- Remove now-unused private helper code in `DmlOps` once delegation is complete. 
+ +**Non-Goals:** +- Changing CLI/API option semantics for init. +- Altering project URI derivation rules or remote requirements. +- Changing db creation strategy beyond reusing existing helper paths. + +## Decisions + +### Delegate layout bootstrap to `init_project_layout` +- **Decision:** Use `init_project_layout(root, DmlProjectConfig(...))` in `DmlOps.init` for writing `.dml/.gitignore`, `.dml/config.toml`, and ensuring `.dml/db/` exists. +- **Rationale:** Centralizes initialization layout logic into one shared internal implementation. +- **Alternative considered:** Keep inline writes and only call helper for new init flows. Rejected because partial delegation preserves duplication and drift risk. + +### Preserve config-exists and db-exists gating semantics +- **Decision:** Continue deriving `config_exists`, `db_exists`, and recovery mode before mutation so pull/no-pull behavior remains unchanged. +- **Rationale:** Existing behavior is covered by recovery specs and tests; this refactor should not alter functional outcomes. +- **Alternative considered:** Recompute existence checks after helper call. Rejected because that can blur recovery-state detection and change pull triggering. + +### Remove duplicated private helpers when obsolete +- **Decision:** Remove helper methods in `DmlOps` that only support the previous inline layout-writing path after equivalent behavior is routed through shared config utilities. +- **Rationale:** Reduces maintenance surface and future inconsistency. +- **Alternative considered:** Keep old helpers as wrappers. Rejected because wrappers can hide dead paths and reintroduce duplicate behavior. + +## Risks / Trade-offs + +- [Helper invocation could write config when not intended] -> Mitigation: only invoke layout helper in the same branch where init currently creates missing config/db state. 
+- [Slightly tighter coupling from ops module to config module helper] -> Mitigation: coupling already exists through `DmlConfig`/`DmlProjectConfig`; this change reuses that boundary. +- [Behavior drift in `.gitignore` or config formatting] -> Mitigation: preserve helper output contract and verify with existing init workflow tests. + +## Migration Plan + +1. Refactor `DmlOps.init` to construct a `DmlProjectConfig` from resolved init config and invoke `init_project_layout` when bootstrap creation is needed. +2. Remove obsolete private helper methods and update imports. +3. Update/adjust internal tests that asserted inline behavior details. +4. Run init-focused test suites to confirm no contract changes. + +Rollback strategy: revert the refactor commit to restore prior inline layout implementation. + +## Open Questions + +- None identified; scope is an internal refactor with contract preservation. diff --git a/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/proposal.md b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/proposal.md new file mode 100644 index 0000000..6e67765 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/proposal.md @@ -0,0 +1,24 @@ +## Why + +`DmlOps.init` currently duplicates project-layout setup behavior that already exists in `init_project_layout`, which increases maintenance cost and makes init recovery logic harder to reason about. +Consolidating init-time layout creation through the shared helper reduces drift risk while preserving the current external init contract. + +## What Changes + +- Refactor `DmlOps.init` to call `init_project_layout` for `.dml` directory/bootstrap config setup. +- Preserve the current `DmlOps.init` interface, return payload, validation behavior, and recovery flow. +- Remove now-unused helper code and duplicated layout-writing paths that become obsolete after delegation. + +## Capabilities + +### New Capabilities +None. 
+ +### Modified Capabilities +- `shared-internal-configuration`: Clarify that init layout/bootstrap file creation is delegated to shared internal project-layout helpers while preserving existing init semantics. + +## Impact + +- Affected code: `src/daggerml/_internal/ops/__init__.py`, `src/daggerml/_internal/config.py`, and init-focused internal tests. +- API/CLI contract impact: none expected; init inputs/outputs and error semantics remain unchanged. +- Dependencies/systems: no new dependencies; this is an internal refactor and dead-code cleanup. diff --git a/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..71deee4 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/specs/shared-internal-configuration/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: Multiple config sources normalize into the shared internal model +The system SHALL treat explicit arguments, environment variables, project-local config, and global config as sources that feed the shared internal configuration model. Source-specific loading may differ, but normalization and precedence MUST be centralized in the shared internal resolver. 
+ +#### Scenario: Project-local and global config feed shared resolution +- **WHEN** a frontend resolves configuration for an operation in a project directory +- **THEN** project-local `.dml/config.toml` and any applicable global config inputs are loaded as sources for the same shared internal resolution path + +#### Scenario: Environment values are normalized centrally +- **WHEN** configuration is resolved from environment variables +- **THEN** the shared internal resolver, not the frontend, maps those values into the canonical internal configuration model + +#### Scenario: Init project layout creation delegates to shared internal helper +- **WHEN** `DmlOps.init` must create missing project layout artifacts for a local project +- **THEN** it delegates filesystem bootstrap work to shared internal project-layout helper logic instead of duplicating directory and config-file writes in ops code diff --git a/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/tasks.md b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/tasks.md new file mode 100644 index 0000000..a1a6486 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-dmlops-init-use-init-project-layout/tasks.md @@ -0,0 +1,14 @@ +## 1. Refactor init layout bootstrap + +- [x] 1.1 Update `DmlOps.init` in `src/daggerml/_internal/ops/__init__.py` to use `init_project_layout` for `.dml` directory, `.gitignore`, and local config bootstrap when initialization needs to create missing layout artifacts. +- [x] 1.2 Keep existing init contract intact by preserving argument handling, default URI derivation, validation errors, return payload keys, and recovery pull gating behavior. + +## 2. Remove obsolete duplicated code + +- [x] 2.1 Remove `DmlOps` private helpers that become unused after delegating bootstrap writes (for example inline config/gitignore-writing helpers), and update imports/usages accordingly. 
+- [x] 2.2 Ensure no remaining duplicated filesystem bootstrap logic exists in `DmlOps.init` that overlaps with `init_project_layout`. + +## 3. Verify behavior with tests + +- [x] 3.1 Update or add tests in init-focused internal suites to assert contract-preserving behavior after helper delegation, including recovery-mode behavior. +- [x] 3.2 Run targeted tests for init and project workflow paths to confirm refactor correctness and unchanged external semantics. diff --git a/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/.openspec.yaml b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/.openspec.yaml new file mode 100644 index 0000000..12e66c2 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-30 diff --git a/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/design.md b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/design.md new file mode 100644 index 0000000..c5bfcfb --- /dev/null +++ b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/design.md @@ -0,0 +1,51 @@ +## Context + +`DmlOps.init` currently receives identity inputs that can overlap (`name`, `remote_project`) and existing behavior relies on implicit assumptions about how project URI is chosen. The requested change introduces an explicit identity selection contract: either the caller provides `remote_project`, or the caller provides `name` and the system derives URI ownership from resolved global config user. The implementation must preserve existing successful URI-based init flows while improving validation and user-facing diagnostics for unresolved user identity. + +## Goals / Non-Goals + +**Goals:** +- Enforce mutual exclusivity of `name` and `remote_project` in init input validation. +- Allow `name` to be omitted when explicit `remote_project` is provided. 
+- Derive canonical project URI from `name` and resolved global config user when `name` is provided. +- Produce descriptive, deterministic errors when user resolution is required but unavailable. + +**Non-Goals:** +- Changing project URI schema or owner normalization rules beyond current contracts. +- Altering remote URI defaults or non-identity init configuration behavior. +- Expanding init to support additional identity sources. + +## Decisions + +- **Single identity source gate in `DmlOps.init`**: Centralize validation so exactly one of (`name`, `remote_project`) is used for project identity. This avoids duplicated logic between CLI and internal call paths. + - Alternative considered: enforce only at CLI parsing level; rejected because programmatic callers of `DmlOps.init` would bypass constraints. + +- **Derive URI only from `name` + resolved user**: If `name` is present, compute project URI using current owner derivation from global config user, and fail if user is unresolved. + - Alternative considered: fallback to anonymous/default owner when user missing; rejected because it silently mutates identity semantics and can produce surprising project ownership. + +- **Preserve explicit URI authority**: If `remote_project` is provided, treat it as authoritative and do not derive from `name`. + - Alternative considered: allow both with precedence rules; rejected because precedence masks user mistakes and weakens contract clarity. + +- **Improve error phrasing for unresolved user path**: Raise explicit repository/config errors that explain why initialization cannot proceed and what input mode avoids the requirement. + - Alternative considered: generic "invalid init arguments" error; rejected as insufficiently actionable. + +## Risks / Trade-offs + +- **[Risk] Backward compatibility for callers passing both fields** → Mitigation: fail fast with explicit mutual-exclusion message so migration is straightforward. 
+- **[Risk] Existing tests may encode old required-name assumptions** → Mitigation: update init tests to cover URI-only path, name-derived path, and unresolved-user failure path. +- **[Trade-off] Stricter validation may surface misconfigurations earlier** → Mitigation: provide actionable error text including required mode (`--name` vs `--remote-project`) and missing user guidance. + +## Migration Plan + +- Update init argument validation and URI derivation path in `DmlOps.init`. +- Align CLI-facing argument docs/help text with new contract. +- Add or update tests for: + - URI-only initialization with omitted name. + - Name-only initialization deriving URI from resolved user. + - Rejection of simultaneous `name` and `remote_project`. + - Rejection when `name` requires user resolution but user is unresolved. +- No data migration needed; change affects init-time validation/derivation only. + +## Open Questions + +- None; requested contract is explicit and can be implemented directly. diff --git a/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/proposal.md b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/proposal.md new file mode 100644 index 0000000..911c3a0 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/proposal.md @@ -0,0 +1,25 @@ +## Why + +The `dml init` inputs currently force users to provide a project name even when they already have a canonical project URI, and they do not clearly enforce a single source of truth between name and URI. This makes initialization ergonomics and validation behavior inconsistent with project identity expectations. + +## What Changes + +- Make `name` optional in `init` so callers can initialize from explicit URI-only inputs. +- Make `name` and `remote_project` mutually exclusive at the `init` contract boundary. +- When `name` is provided, derive `remote_project` from `name` plus resolved global config user. 
+- Raise a descriptive repository/config error when `name` is provided but global config user is unresolved. +- Keep existing behavior for explicit `remote_project` initialization paths. + +## Capabilities + +### New Capabilities +- `init-input-normalization`: Normalize and validate `init` identity inputs so exactly one identity source is used and derived URI behavior is deterministic. + +### Modified Capabilities +- None. + +## Impact + +- Affected code: init CLI handler and `DmlOps.init` identity validation/derivation paths. +- Affected APIs: initialization argument semantics (`name` optional, `name`/`remote_project` exclusivity). +- Error behavior: clearer user-facing failure when user identity cannot be resolved for name-based initialization. diff --git a/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/specs/init-input-normalization/spec.md b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/specs/init-input-normalization/spec.md new file mode 100644 index 0000000..56193d8 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/specs/init-input-normalization/spec.md @@ -0,0 +1,29 @@ +## ADDED Requirements + +### Requirement: Init identity inputs are mutually exclusive +The init operation MUST reject requests that provide both a project name and an explicit project URI, and it MUST return a descriptive validation error that explains only one identity source can be used. + +#### Scenario: Name and project URI are both provided +- **WHEN** init is called with both `name` and `remote_project` +- **THEN** init fails with an error stating these inputs are mutually exclusive and one must be removed + +### Requirement: Init accepts URI-only identity +The init operation MUST allow `name` to be omitted when `remote_project` is provided and MUST initialize project identity from the explicit URI. 
+ +#### Scenario: Project URI without name +- **WHEN** init is called with `remote_project` and no `name` +- **THEN** init succeeds and project configuration uses the provided project URI + +### Requirement: Init derives URI from name using resolved user +When init is called with `name` and without `remote_project`, the system MUST resolve the global config user and derive the canonical project URI from that user and the provided name. + +#### Scenario: Name-only init with resolved user +- **WHEN** init is called with `name`, no `remote_project`, and a resolvable global config user +- **THEN** init succeeds and stores a project URI derived from the resolved user and provided name + +### Requirement: Name-based init fails when user cannot be resolved +When init is called with `name` and without `remote_project`, and global config user cannot be resolved, init MUST fail with a descriptive configuration error explaining that name-based init requires a resolved user identity. + +#### Scenario: Name-only init with unresolved user +- **WHEN** init is called with `name`, no `remote_project`, and no resolvable global config user +- **THEN** init fails with an error that states user resolution is required for name-derived project URI generation diff --git a/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/tasks.md b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/tasks.md new file mode 100644 index 0000000..522cd59 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-init-optional-name-uri-exclusive/tasks.md @@ -0,0 +1,18 @@ +## 1. Init identity contract updates + +- [x] 1.1 Update `DmlOps.init` validation so `name` and `remote_project` are mutually exclusive. +- [x] 1.2 Allow `name` to be omitted when `remote_project` is supplied and preserve explicit URI authority. +- [x] 1.3 Implement name-based `remote_project` derivation via resolved global config user. 
+- [x] 1.4 Add explicit failure for unresolved user in name-based init with actionable error text. + +## 2. CLI and config integration + +- [x] 2.1 Align init CLI argument/help semantics with optional `name` and exclusivity rules. +- [x] 2.2 Ensure init command paths surface the new validation/configuration error messages consistently. + +## 3. Test coverage + +- [x] 3.1 Add/adjust tests for URI-only init without `name`. +- [x] 3.2 Add/adjust tests for name-only init deriving URI from resolved user. +- [x] 3.3 Add/adjust tests for rejection when both `name` and `remote_project` are provided. +- [x] 3.4 Add/adjust tests for rejection when name-based init cannot resolve global config user. diff --git a/openspec/changes/archive/2026-04-29-remove-clone-functionality/.openspec.yaml b/openspec/changes/archive/2026-04-29-remove-clone-functionality/.openspec.yaml new file mode 100644 index 0000000..12e66c2 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-remove-clone-functionality/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-30 diff --git a/openspec/changes/archive/2026-04-29-remove-clone-functionality/design.md b/openspec/changes/archive/2026-04-29-remove-clone-functionality/design.md new file mode 100644 index 0000000..81fc9f1 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-remove-clone-functionality/design.md @@ -0,0 +1,60 @@ +## Context + +`clone` behavior currently spans CLI routing, `DmlOps` orchestration, and remote/project bootstrap rules. This creates duplicate bootstrap paths alongside `init`, plus clone-only hook/config branches that increase complexity and weaken the thin-wrapper CLI boundary. The change removes clone end-to-end and keeps `init` as the only repository bootstrap flow, while preserving existing non-clone git-like operations (`fetch`, `checkout`, `pull`, `push`, `merge`, `revert`). + +Constraints: +- No backward compatibility for clone behavior or aliases. 
+- CLI must remain a thin adapter over public `daggerml._internal` APIs. +- Init behavior and recovery guarantees must remain intact. + +## Goals / Non-Goals + +**Goals:** +- Remove all user-facing and internal clone entrypoints and data paths. +- Keep project lifecycle coherent around `init` plus explicit remote synchronization commands. +- Eliminate dead code and clone-only test/doc surface. +- Tighten CLI architecture so handlers only parse arguments and call one internal API path. + +**Non-Goals:** +- Replacing clone with a new single-command bootstrap workflow in this change. +- Adding migration shims, deprecation windows, or compatibility aliases. +- Redesigning non-clone remote protocol semantics. + +## Decisions + +1. Remove clone at the contract layer first (OpenSpec deltas), then enforce in implementation. + - Rationale: specs become the source of truth for deleting behavior and guide required code/test/doc removals. + - Alternative considered: removing code first and backfilling specs; rejected because it risks partial behavior drift. + +2. Preserve `init` as the sole bootstrap primitive and require explicit remote actions after init. + - Rationale: simpler mental model and clearer failure boundaries; avoids hidden fetch/checkout side effects. + - Alternative considered: folding clone semantics into init flags; rejected to keep scope focused and avoid reintroducing implicit bootstrap orchestration. + +3. Keep CLI command modules as thin wrappers over internal APIs and forbid CLI-owned orchestration. + - Rationale: consolidates business logic in internal APIs, reducing duplication and improving testability. + - Alternative considered: retaining limited CLI composition for convenience; rejected because it weakens layering and recreates clone-like coupling. + +4. Remove clone-specific hooks/config branches entirely instead of leaving inert fields. + - Rationale: hard removal avoids dead configuration contracts and accidental future reuse. 
+ - Alternative considered: retaining `post-clone` config as ignored/no-op; rejected due to backward-compat burden and ambiguous UX. + +## Risks / Trade-offs + +- Users depending on `dml clone` lose a one-step workflow immediately -> Provide clear docs/tasks updates describing init + fetch/checkout alternatives. +- Clone code may share helpers with non-clone flows, creating accidental regressions during removal -> Refactor shared helpers first, then remove clone-only branches with targeted test updates. +- Thin-wrapper enforcement can surface hidden coupling in CLI modules -> Move orchestration into internal APIs and keep command handlers argument-only. +- Spec cleanup across multiple capabilities can miss references -> Use capability deltas for each impacted spec and ensure no clone requirements remain. + +## Migration Plan + +1. Update capability specs to delete clone requirements and codify init-only bootstrap + thin CLI constraints. +2. Remove clone command wiring and internal clone methods/ops; refactor any shared helpers needed by remaining commands. +3. Remove clone tests and rewrite affected assertions toward init + explicit remote workflow behavior. +4. Remove clone documentation and hook/config references; update CLI/help text to exclude clone. +5. Run full project test suites for CLI/internal ops and verify no clone paths remain reachable. + +Rollback strategy: revert this change set as a whole if removal breaks critical workflows; partial rollback is discouraged because contract and implementation must stay aligned. + +## Open Questions + +- None; the change explicitly requires hard removal with no backward compatibility. 
diff --git a/openspec/changes/archive/2026-04-29-remove-clone-functionality/proposal.md b/openspec/changes/archive/2026-04-29-remove-clone-functionality/proposal.md new file mode 100644 index 0000000..3841cdb --- /dev/null +++ b/openspec/changes/archive/2026-04-29-remove-clone-functionality/proposal.md @@ -0,0 +1,28 @@ +## Why + +The current `clone` surface duplicates initialization and project bootstrap pathways, increasing maintenance cost and creating split behavior between CLI routing and internal orchestration. Removing clone now simplifies the product model to an init-first workflow and enforces the intended architecture where CLI commands are thin adapters over `daggerml._internal` public APIs. + +## What Changes + +- **BREAKING**: Remove the `dml clone` command from the CLI and all internal clone orchestration paths. +- **BREAKING**: Remove `clone`-specific behavior from `DmlOps` and internal ops/contracts; do not keep compatibility shims or aliases. +- Preserve and harden `init` as the only project bootstrap entrypoint. +- Refactor CLI project commands to remain thin wrappers that delegate directly to supported `daggerml._internal` APIs with no embedded workflow logic. +- Remove dead code, tests, docs references, and configuration/hook branches that only exist for clone flows. + +## Capabilities + +### New Capabilities +None. + +### Modified Capabilities +- `thin-cli-routing`: remove clone delegation requirements and strengthen requirement that CLI project commands are thin wrappers over internal APIs only. +- `git-like-commit-ops`: remove clone composition requirements (`fetch` then `checkout`) and unsupported clone-target semantics. +- `remote-project-refs`: remove clone-related initialization/origin recording requirements while preserving init and non-clone remote workflows. + +## Impact + +- Affected code: `src/daggerml/_cli/**`, `src/daggerml/api.py` (`DmlOps`), and internal ops modules supporting clone orchestration. 
+- Affected tests: CLI and internal operation tests that cover clone behavior need removal or rewrite toward init+fetch/checkout workflows. +- Affected docs/specs: clone references in OpenSpec capabilities and user-facing CLI docs must be removed. +- User impact: existing clone-based workflows are no longer available; users must initialize projects with `init` and use explicit fetch/checkout/pull flows. diff --git a/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..8f1c05a --- /dev/null +++ b/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/git-like-commit-ops/spec.md @@ -0,0 +1,5 @@ +## REMOVED Requirements + +### Requirement: Clone composes fetch then checkout +**Reason**: Clone is intentionally removed with no backward compatibility; bootstrap behavior is now modeled through explicit init and subsequent git-like commands. +**Migration**: Use `dml init` to create local project state, then run `dml fetch` and `dml checkout` (or `dml pull`) explicitly. diff --git a/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/remote-project-refs/spec.md b/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/remote-project-refs/spec.md new file mode 100644 index 0000000..4803055 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/remote-project-refs/spec.md @@ -0,0 +1,61 @@ +## MODIFIED Requirements + +### Requirement: Project directory initialization +The system SHALL initialize local project state under `<project>/.dml/` for `init`. 
+ +#### Scenario: Init creates DML directory +- **WHEN** `dml init demo` succeeds +- **THEN** the system creates `demo/.dml/`, `demo/.dml/config.toml`, and local database storage under `demo/.dml/db/` + +#### Scenario: Init refuses existing child directory +- **WHEN** `dml init demo` runs and `demo/` already exists +- **THEN** init fails and instructs the user to initialize that directory with `dml init --here demo` + +#### Scenario: Init here creates DML directory in current directory +- **WHEN** `dml init --here demo` succeeds from the current directory +- **THEN** the system creates `.dml/`, `.dml/config.toml`, and local database storage under `.dml/db/` + +#### Scenario: Init here uses provided project name +- **WHEN** `dml init --here demo` succeeds from directory `workdir` +- **THEN** the local project name is `demo` + +#### Scenario: Init creates DML gitignore +- **WHEN** `dml init demo` succeeds +- **THEN** the system writes `demo/.dml/.gitignore` containing `*` + +#### Scenario: Init creates initial branch +- **WHEN** `dml init demo` succeeds +- **THEN** local storage contains an initial empty commit/tree and the current branch is `main` + +### Requirement: Init shell hooks +The system SHALL support `post-init` shell hooks from global DML config that run in the project directory after `.dml/` exists. 
+ +#### Scenario: Init hook succeeds +- **WHEN** a `post-init` hook command is configured and `dml init demo` runs +- **THEN** the hook command runs in the `demo` project directory after `demo/.dml/` exists + +#### Scenario: Init here hook succeeds +- **WHEN** a `post-init` hook command is configured and `dml init --here demo` runs +- **THEN** the hook command runs in the current directory after `.dml/` exists + +#### Scenario: Hooks run in configured order +- **WHEN** multiple `post-init` hook commands are configured and `dml init demo` runs +- **THEN** the hook commands run in their configured list order + +#### Scenario: Init no-hooks skips hooks +- **WHEN** `dml init --no-hooks demo` runs +- **THEN** no `post-init` hook commands run + +#### Scenario: Hook environment is provided +- **WHEN** a `post-init` hook command runs +- **THEN** the process environment includes `DML_HOOK`, `DML_PROJECT_HOME`, `DML_PROJECT_NAME`, `DML_PROJECT_OWNER`, `DML_CONFIG_HOME`, and `DML_BRANCH` + +## REMOVED Requirements + +### Requirement: Init and clone shell hooks +**Reason**: Clone hooks are removed because clone is no longer a supported workflow. +**Migration**: Keep bootstrap automation in `post-init` hooks and run explicit remote commands after initialization when needed. + +### Requirement: Clone records origin +**Reason**: Clone is removed; automatic origin recording during clone no longer applies. +**Migration**: Configure remotes through init-time/project config workflows and use explicit fetch/push/pull commands. 
diff --git a/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/thin-cli-routing/spec.md b/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/thin-cli-routing/spec.md new file mode 100644 index 0000000..b863550 --- /dev/null +++ b/openspec/changes/archive/2026-04-29-remove-clone-functionality/specs/thin-cli-routing/spec.md @@ -0,0 +1,26 @@ +## MODIFIED Requirements + +### Requirement: CLI project commands delegate to a single internal API method +The `dml` CLI project command handlers SHALL remain thin adapters that parse command arguments and invoke exactly one supported `daggerml._internal` API entrypoint per command path. + +#### Scenario: Fetch delegates through internal API +- **WHEN** a user runs `dml fetch [branch]` +- **THEN** the CLI handler parses inputs and calls one internal fetch workflow entrypoint that performs remote synchronization behavior + +#### Scenario: Checkout delegates through internal API +- **WHEN** a user runs `dml checkout <revision>` +- **THEN** the CLI handler parses the revision and calls one internal checkout workflow entrypoint that returns attached/detached result details + +#### Scenario: Merge delegates through internal API +- **WHEN** a user runs `dml merge --head <head> --user <user>` +- **THEN** the CLI handler calls one internal merge workflow entrypoint and does not instantiate commit/remote ops directly + +#### Scenario: Init delegates through internal API +- **WHEN** a user runs `dml init <name>` +- **THEN** the CLI handler parses inputs and calls one internal init workflow entrypoint without composing additional bootstrap workflows in CLI code + +## REMOVED Requirements + +### Requirement: Clone command composes via DmlOps workflow +**Reason**: Clone is removed from the product surface to enforce an init-first lifecycle and eliminate duplicate bootstrap orchestration. 
+**Migration**: Initialize projects with `dml init` and then run explicit remote synchronization commands (`dml fetch`, `dml checkout`, `dml pull`) as needed. diff --git a/openspec/changes/archive/2026-04-29-remove-clone-functionality/tasks.md b/openspec/changes/archive/2026-04-29-remove-clone-functionality/tasks.md new file mode 100644 index 0000000..c219a6e --- /dev/null +++ b/openspec/changes/archive/2026-04-29-remove-clone-functionality/tasks.md @@ -0,0 +1,23 @@ +## 1. Remove clone command and internal entrypoints + +- [x] 1.1 Remove `dml clone` CLI command wiring, argument parsing, and help text from `src/daggerml/_cli/**`. +- [x] 1.2 Remove clone orchestration methods from `src/daggerml/api.py` (`DmlOps`) and eliminate clone-specific internal API surfaces. +- [x] 1.3 Delete clone-only internal ops/modules and imports, refactoring shared helpers so remaining commands compile and run without clone branches. + +## 2. Preserve and enforce init-first workflow + +- [x] 2.1 Ensure `init` remains fully functional as the only bootstrap entrypoint, including recovery behavior for config-present/db-missing states. +- [x] 2.2 Remove clone-specific hook/config pathways (including post-clone handling) while preserving `post-init` behavior and environment contracts. +- [x] 2.3 Update remote/project initialization logic so no clone-origin recording path remains and explicit remote workflows (`fetch`/`checkout`/`pull`) are the only follow-up path. + +## 3. Keep CLI as thin wrappers over internal APIs + +- [x] 3.1 Refactor remaining project command handlers to parse inputs and call exactly one supported `daggerml._internal` API entrypoint per command. +- [x] 3.2 Remove any CLI-owned git-like orchestration code paths uncovered during clone removal. +- [x] 3.3 Verify CLI modules do not directly compose multi-step project workflows outside internal APIs. + +## 4. 
Remove dead code and update tests/docs + +- [x] 4.1 Remove or rewrite clone-focused tests across CLI/internal suites to assert clone absence and init + explicit remote command behavior. +- [x] 4.2 Remove clone references from user/developer docs and command examples; update workflow guidance to init-first. +- [x] 4.3 Run targeted and full test suites covering CLI routing, init, and remote operations; fix regressions caused by clone removal. diff --git a/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/.openspec.yaml b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/.openspec.yaml new file mode 100644 index 0000000..12e66c2 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-30 diff --git a/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/design.md b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/design.md new file mode 100644 index 0000000..ea91f5b --- /dev/null +++ b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/design.md @@ -0,0 +1,58 @@ +## Context + +`DmlOps.init` currently blends project placement, config setup, and repo bootstrap in ways that make behavior hard to reason about across CLI and API callers. The change introduces three coupled concerns: (1) init location semantics become fixed to the current working project root (`.dml/` here), (2) init-time options must be normalized and validated through the shared internal resolver before side effects, and (3) an existing-config/missing-db edge case must be recoverable without manual repair. + +This is cross-cutting because the behavior touches orchestration in internal ops, caller input contracts, and remote bootstrap behavior (`pull`) that depends on correctly resolved project and remote settings. 
+ +## Goals / Non-Goals + +**Goals:** +- Make `DmlOps.init` deterministic: initialize local project metadata under `.dml/` at the current location only. +- Ensure init inputs use canonical config resolution/validation (`explicit > env > project > global > defaults`) through shared internal config code. +- Fail fast when required config values cannot be resolved to valid values (especially `remote.uri` where required by downstream behavior). +- Support idempotent recovery when `.dml/config.toml` exists but `.dml/db/` is absent by creating DB and syncing project state when `remote.project` exists. + +**Non-Goals:** +- Redesigning full clone/fetch/checkout workflow semantics outside of init bootstrap. +- Introducing new config keys or a new precedence model. +- Changing remote protocol, CAS layout, or merge semantics. + +## Decisions + +- Use local-root-only init semantics. + - Decision: remove directory-creation placement from `DmlOps.init`; it creates `.dml/` in the current project location. + - Rationale: aligns with git-like repository initialization semantics and removes ambiguity around `here`/path interpretation. + - Alternative considered: keep `here` and deprecate later. Rejected because dual behavior would preserve ambiguity and complicate validation and caller contracts. + +- Resolve and validate init config before mutating filesystem state. + - Decision: run `DmlOps.init` options through the shared internal resolver and require canonical resolved fields before writing config or creating DB. + - Rationale: guarantees consistent API/CLI behavior and shifts config failures to the earliest possible point. + - Alternative considered: allow partial init then validate during pull. Rejected because it produces half-initialized states and deferred runtime failures. + +- Treat existing config + missing DB as a supported recovery path. 
+ - Decision: if `.dml/config.toml` exists and `.dml/db/` is absent, init creates DB, then conditionally runs pull when resolved `remote.project` is present. + - Rationale: this state appears after interrupted setup or manual migration; deterministic recovery avoids requiring users to hand-edit local metadata. + - Alternative considered: fail and require manual remediation. Rejected because it increases operator burden and creates avoidable support complexity. + +- Require normalized remote configuration for remote-aware bootstrap. + - Decision: remote-aware init/pull path consumes validated `remote.uri` from shared resolver; invalid or unresolved required values fail init. + - Rationale: enforces `required-remote-config` consistency and avoids hidden env/config probing in downstream components. + - Alternative considered: allow remote URI omission and best-effort pull. Rejected because behavior becomes nondeterministic and failure modes become late. + +## Risks / Trade-offs + +- [Breaking caller behavior for placement options] -> Mitigation: update CLI/API validation and help text to remove `here`/directory-creation mode and document local-root-only semantics. +- [Stricter validation may fail previously tolerated setups] -> Mitigation: provide clear, field-specific errors from resolver-backed validation and preserve precedence rules users already depend on. +- [Auto-pull during recovery may surface remote errors during init] -> Mitigation: make pull conditional on `remote.project` presence and keep failure messages explicit about remote config or connectivity causes. +- [Idempotency regressions in repeated init calls] -> Mitigation: add tests for repeated init on clean, already-initialized, and partial states to verify stable outcomes. + +## Migration Plan + +- Update `DmlOps.init` call contract and implementation first, then align CLI/API entrypoints with the new argument/validation expectations. 
+- Add/adjust tests for local-root-only init, config validation failures, and recovery flow (`config exists`, `db missing`). +- Rollout is source-compatible for callers that already initialize in-place; callers relying on directory placement must switch to invoking init from target directory. +- Rollback strategy: revert init contract changes and associated spec deltas in one patch if critical compatibility issues arise. + +## Open Questions + +- Should recovery-mode pull be best-effort with warnings or fail-hard when `remote.project` is present but remote configuration is invalid? Current direction is fail-hard for consistency with strict init validation. diff --git a/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/proposal.md b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/proposal.md new file mode 100644 index 0000000..45f5c1c --- /dev/null +++ b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/proposal.md @@ -0,0 +1,26 @@ +## Why + +`DmlOps.init` still behaves like a directory scaffolder instead of a strict project initializer rooted at the current location. That causes ambiguous project placement and allows invalid or unresolved configuration (especially remote URI) to slip past initialization, which then fails later in pull/sync flows. + +## What Changes + +- Change `DmlOps.init` semantics to initialize only in the current location by creating `.dml/` locally, instead of creating a new project directory from a `here`/path-placement mode. +- Require `DmlOps.init` to accept standard config inputs and resolve them through the shared internal config resolver before proceeding. +- Enforce resolver-backed validation for required values (notably valid `remote.uri` when required by the init flow), and fail fast on invalid/unresolved configuration. 
+- Add recovery behavior for partially initialized repos: when `.dml/config.toml` exists but `.dml/db/` does not, create the DB and run pull when a project URI is configured. + +## Capabilities + +### New Capabilities +- `dmlops-init-recovery`: Define deterministic recovery behavior for `DmlOps.init` when config exists but local DB is missing. + +### Modified Capabilities +- `shared-internal-configuration`: `DmlOps.init` must resolve and validate init-time config via the shared resolver, including required-field enforcement. +- `git-like-commit-ops`: Project init workflow semantics change to local `.dml/` initialization only, and remove directory-creation placement behavior. +- `required-remote-config`: Init and remote-aware setup paths enforce normalized, validated `remote.uri` from shared resolution rather than optional/late handling. + +## Impact + +- Affected code: `DmlOps` init orchestration and config handoff paths, plus related CLI/API call sites that supply init options. +- Affected behavior: project initialization location, validation timing, and bootstrap/recovery when local DB is absent. +- User-facing impact: clearer init contract (initialize "here" only), earlier actionable errors for bad config, and better auto-recovery for existing `.dml/config.toml` states. diff --git a/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/dmlops-init-recovery/spec.md b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/dmlops-init-recovery/spec.md new file mode 100644 index 0000000..c7d978f --- /dev/null +++ b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/dmlops-init-recovery/spec.md @@ -0,0 +1,19 @@ +## ADDED Requirements + +### Requirement: Init recovers missing DB when project config already exists +The system SHALL treat `.dml/config.toml` + missing `.dml/db/` as a recoverable initialization state. 
+ +#### Scenario: Existing config with missing DB is recovered +- **WHEN** `DmlOps.init` runs in a project where `.dml/config.toml` exists and `.dml/db/` does not +- **THEN** initialization creates `.dml/db/` and completes without requiring manual repository repair + +### Requirement: Recovery mode pulls when a project URI is configured +The system SHALL perform project bootstrap pull during recovery when resolved configuration includes `remote.project`. + +#### Scenario: Recovery triggers pull when project URI is present +- **WHEN** `DmlOps.init` recovers a missing DB and resolved config includes `remote.project` +- **THEN** it runs pull using the resolved project and remote configuration to populate local repository state + +#### Scenario: Recovery skips pull when project URI is absent +- **WHEN** `DmlOps.init` recovers a missing DB and resolved config has no `remote.project` +- **THEN** it creates local DB state without invoking pull diff --git a/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..66eb083 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/git-like-commit-ops/spec.md @@ -0,0 +1,24 @@ +## MODIFIED Requirements + +### Requirement: Git-like project workflows are owned by DmlOps orchestration +Git-like project command workflows SHALL execute through `DmlOps` orchestration methods that coordinate commit and remote operations without requiring CLI-owned business logic. 
+ +#### Scenario: Pull executes through DmlOps workflow +- **WHEN** a caller invokes project pull with remote target, head ref, and user context +- **THEN** `DmlOps` resolves project context, performs remote synchronization, and applies merge behavior through internal ops + +#### Scenario: Push executes through DmlOps workflow +- **WHEN** a caller invokes project push with remote target and push options +- **THEN** `DmlOps` performs project-aware remote push behavior and returns the push result without CLI-managed remote orchestration + +#### Scenario: Revert executes through DmlOps workflow +- **WHEN** a caller invokes project revert with revision, head ref, and user context +- **THEN** `DmlOps` resolves the revision and performs revert behavior through internal commit operations + +#### Scenario: Init runs as in-place project setup +- **WHEN** a caller invokes `DmlOps.init` +- **THEN** it initializes project state under `.dml/` in the current location instead of creating a separate project directory + +#### Scenario: Init recovers config-first partial state +- **WHEN** `.dml/config.toml` exists but `.dml/db/` is missing at init time +- **THEN** `DmlOps.init` creates the missing DB state and continues bootstrap behavior based on resolved configuration diff --git a/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/required-remote-config/spec.md b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/required-remote-config/spec.md new file mode 100644 index 0000000..52f9413 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/required-remote-config/spec.md @@ -0,0 +1,20 @@ +## MODIFIED Requirements + +### Requirement: Remote-aware components require explicit remote configuration +The system SHALL require explicit remote configuration at the constructor or helper boundary for any runtime or ops component that performs remote-backed behavior. 
Remote-aware interfaces MUST NOT model remote configuration as optional, MUST NOT provide `None` defaults for required remote parameters, and MUST receive normalized `remote.uri` configuration from the shared internal configuration resolver rather than reading raw environment variables or project config files themselves. + +#### Scenario: Remote-aware ops constructor requires remote URI +- **WHEN** a remote-aware ops type is defined +- **THEN** its constructor signature requires a concrete normalized remote URI argument rather than an optional remote parameter + +#### Scenario: Remote-aware runtime helper requires remote configuration +- **WHEN** a runtime helper delegates to remote-backed behavior +- **THEN** it passes explicit remote configuration to the remote-aware component it constructs + +#### Scenario: Remote-aware component does not resolve env vars directly +- **WHEN** a remote-aware runtime or ops component is used in a remote-backed flow +- **THEN** it receives already-resolved remote configuration from its caller instead of inspecting `DML_REMOTE`, older remote env-var forms, or project config files directly + +#### Scenario: Init fails when required remote URI cannot resolve validly +- **WHEN** `DmlOps.init` requires remote-backed bootstrap behavior and shared config resolution does not produce a valid `remote.uri` +- **THEN** init fails with a configuration error instead of proceeding with unresolved or implicit remote configuration diff --git a/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..684aa3d --- /dev/null +++ b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/specs/shared-internal-configuration/spec.md @@ -0,0 +1,35 @@ +## MODIFIED Requirements + +### Requirement: Multiple config sources normalize 
into the shared internal model +The system SHALL treat explicit arguments, environment variables, project-local config, and global config as sources that feed the shared internal configuration model. Source-specific loading may differ, but normalization and precedence MUST be centralized in the shared internal resolver. + +#### Scenario: Project-local and global config feed shared resolution +- **WHEN** a frontend resolves configuration for an operation in a project directory +- **THEN** project-local `.dml/config.toml` and any applicable global config inputs are loaded as sources for the same shared internal resolution path + +#### Scenario: Environment values are normalized centrally +- **WHEN** configuration is resolved from environment variables +- **THEN** the shared internal resolver, not the frontend, maps those values into the canonical internal configuration model + +#### Scenario: Init resolves explicit options through shared resolver +- **WHEN** a caller provides init-time options for project/runtime configuration +- **THEN** `DmlOps.init` resolves them through the shared internal resolver before mutating project state + +### Requirement: Project URI is normalized and exposes helper accessors +The system SHALL normalize `remote.project` so that resolved project configuration always includes a branch and never a tag. The resolved config object SHALL expose a `project.branch` helper derived from the normalized URI. 
+ +#### Scenario: Missing branch normalizes from default branch +- **WHEN** `remote.project` is provided without a branch in `project/runtime` scope +- **THEN** the resolver appends the effective default branch to the normalized `remote.project` + +#### Scenario: Tag URI is rejected for project context +- **WHEN** `remote.project` is provided with a tag selector +- **THEN** project configuration resolution fails because active project context must target a branch, not an immutable tag + +#### Scenario: Project branch helper is derived from normalized URI +- **WHEN** resolved configuration includes `remote.project` +- **THEN** `project.branch` returns the branch encoded in the normalized URI rather than reading a standalone branch config parameter + +#### Scenario: Init fails when required project URI cannot resolve validly +- **WHEN** init flow requires `remote.project` for bootstrap behavior but resolver output leaves it invalid or unresolved +- **THEN** `DmlOps.init` fails before creating or mutating repository state diff --git a/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/tasks.md b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/tasks.md new file mode 100644 index 0000000..e02c0a0 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-refine-dmlops-init-config-resolution/tasks.md @@ -0,0 +1,17 @@ +## 1. Align init contract and config resolution + +- [x] 1.1 Update `DmlOps.init` inputs/signature to remove directory-placement (`here`) behavior and define in-place `.dml/` initialization contract. +- [x] 1.2 Route init-time options through the shared internal config resolver and validate resolved canonical fields before filesystem mutation. +- [x] 1.3 Ensure required init-time values fail fast with explicit errors when unresolved/invalid (including `remote.project` and `remote.uri` when required by bootstrap flow). + +## 2. 
Implement init recovery bootstrap behavior + +- [x] 2.1 Add initialization path for `.dml/config.toml` present + `.dml/db/` missing that creates the missing DB state idempotently. +- [x] 2.2 Trigger pull during recovery when resolved config includes `remote.project`, using resolved remote/project context. +- [x] 2.3 Ensure recovery path skips pull when no `remote.project` is configured and still completes local init successfully. + +## 3. Update callers, docs, and test coverage + +- [x] 3.1 Update CLI/API entrypoints and help/error text to match in-place init semantics and resolver-validated options. +- [x] 3.2 Add/adjust tests for local-only init location semantics, strict config validation failures, and recovery-mode pull/no-pull branches. +- [x] 3.3 Add regression tests for repeat init idempotency across clean, fully initialized, and config-only partial states. diff --git a/openspec/changes/archive/2026-04-30-remove-dmlops-clone/.openspec.yaml b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/.openspec.yaml new file mode 100644 index 0000000..12e66c2 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-30 diff --git a/openspec/changes/archive/2026-04-30-remove-dmlops-clone/design.md b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/design.md new file mode 100644 index 0000000..acb2ec1 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/design.md @@ -0,0 +1,56 @@ +## Context + +Clone behavior currently spans CLI parsing, `DmlOps` workflow composition, and lower-level fetch/checkout operations. The `DmlOps.clone` layer is no longer providing unique behavior and instead proxies or re-composes logic that already exists elsewhere, which increases indirection and test maintenance. This change removes that layer while preserving observable clone behavior defined by existing specs. 
+ +Constraints: +- No backward compatibility shim for `DmlOps.clone`. +- Clone semantics must remain aligned with existing remote/fetch/checkout requirements. +- Related dead code should be removed in the same change to avoid partial cleanup. + +## Goals / Non-Goals + +**Goals:** +- Remove `DmlOps.clone` in all forms and all call sites. +- Route clone execution through supported internal operations directly. +- Keep user-facing clone behavior stable where requirements are unchanged. +- Reduce maintenance overhead by deleting clone-specific wrappers/helpers that become unused. + +**Non-Goals:** +- Changing remote protocol semantics. +- Introducing new clone features. +- Preserving internal compatibility for code that imports or calls `DmlOps.clone`. + +## Decisions + +- Remove `DmlOps.clone` methods and update CLI routing to invoke the surviving operation path directly. + - Rationale: keeps one authoritative clone composition path and removes duplicate orchestration logic. + - Alternative considered: keep `DmlOps.clone` as a thin forwarding wrapper; rejected because it preserves dead abstraction and violates the no-shims requirement. + +- Keep existing clone behavior assertions at CLI/operation boundaries, but relocate tests away from `DmlOps.clone` targets. + - Rationale: protects behavior while allowing internal refactor/removal. + - Alternative considered: broad test rewrite from scratch; rejected as unnecessary risk and effort. + +- Remove clone-only helpers that become unreachable after method removal. + - Rationale: avoid latent dead code and future confusion. + - Alternative considered: defer cleanup to follow-up PR; rejected because dead code is directly caused by this change and should be removed atomically. + +## Risks / Trade-offs + +- [Risk] Hidden callers depend on `DmlOps.clone` internally. → Mitigation: run repository-wide reference search and update all call sites in the same change. +- [Risk] Behavioral regressions from routing updates. 
→ Mitigation: retain/adjust existing clone integration tests and add targeted regression coverage where the call path changes. +- [Risk] Over-deletion of shared helpers. → Mitigation: remove only helpers proven clone-exclusive by static references and test coverage. + +## Migration Plan + +1. Remove `DmlOps.clone` definitions and exports. +2. Rewire CLI clone path to use direct operation composition. +3. Delete dead clone-specific helpers and obsolete tests. +4. Update and run clone-related unit/integration tests. +5. Update docs/comments that reference `DmlOps.clone`. + +Rollback strategy: +- Revert this change as a unit if regressions are found, since no data migration is involved and the change is code-path/abstraction removal. + +## Open Questions + +- None currently; proceed with implementation based on existing clone/fetch/checkout requirements. diff --git a/openspec/changes/archive/2026-04-30-remove-dmlops-clone/proposal.md b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/proposal.md new file mode 100644 index 0000000..01950b3 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/proposal.md @@ -0,0 +1,25 @@ +## Why + +`DmlOps.clone` duplicates clone workflow composition that already exists in lower-level operations and creates an extra maintenance surface with little value. Removing it now simplifies the architecture before more clone/fetch/checkout changes land. + +## What Changes + +- Remove `DmlOps.clone` entrypoints in all forms (including sync/async/wrapper variants) and eliminate all direct call paths. +- Rewire clone command handling to use surviving internal operations directly, without compatibility shims. +- Delete dead code, helpers, and tests that exist only to support `DmlOps.clone`. +- Update tests and docs to reflect the new routing path and removed internal API. +- **BREAKING**: internal `DmlOps.clone` API is removed with no backward compatibility layer. + +## Capabilities + +### New Capabilities +- None. 
+ +### Modified Capabilities +- `thin-cli-routing`: clone CLI routing no longer delegates through a `DmlOps.clone` workflow method and instead composes clone behavior through supported internal operations. + +## Impact + +- Affected code: clone-related CLI handlers, `DmlOps` class methods, clone workflow helpers, and associated tests. +- Affected APIs: internal Python API surface that referenced `DmlOps.clone`. +- Dependencies/systems: no external dependency additions; behavior remains aligned with existing fetch/checkout and remote project semantics. diff --git a/openspec/changes/archive/2026-04-30-remove-dmlops-clone/specs/thin-cli-routing/spec.md b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/specs/thin-cli-routing/spec.md new file mode 100644 index 0000000..b70e102 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/specs/thin-cli-routing/spec.md @@ -0,0 +1,31 @@ +## MODIFIED Requirements + +### Requirement: CLI project commands delegate to a single DmlOps method +The `dml` CLI project command handlers SHALL remain thin adapters that parse command arguments and invoke exactly one workflow entrypoint per command path. 
+ +#### Scenario: Fetch delegates through DmlOps +- **WHEN** a user runs `dml fetch [branch]` +- **THEN** the CLI handler parses inputs and calls one `DmlOps` fetch workflow method that performs remote synchronization behavior + +#### Scenario: Checkout delegates through DmlOps +- **WHEN** a user runs `dml checkout <revision>` +- **THEN** the CLI handler parses the revision and calls one `DmlOps` checkout workflow method that returns attached/detached result details + +#### Scenario: Merge delegates through DmlOps +- **WHEN** a user runs `dml merge --head <head> --user <user>` +- **THEN** the CLI handler calls one `DmlOps` merge workflow method and does not instantiate commit/remote ops directly + +#### Scenario: Clone delegates through internal operations entrypoint +- **WHEN** a user runs `dml clone <uri> [options]` +- **THEN** the CLI handler parses inputs and calls one supported internal operations entrypoint for clone orchestration without invoking `DmlOps.clone` + +### Requirement: Clone command composes via DmlOps workflow +The clone CLI entrypoint SHALL delegate clone workflow composition through supported internal operations after input parsing and command-level validation.
+ +#### Scenario: Clone branch flow delegation +- **WHEN** a user runs `dml clone dml://alice/demo#main --bucket my-bucket` +- **THEN** the CLI entrypoint delegates to one internal clone orchestration path that performs fetch and checkout composition and returns clone result metadata without `DmlOps.clone` + +#### Scenario: Clone tag flow delegation +- **WHEN** a user runs `dml clone dml://alice/demo@v1.0 --bucket my-bucket` +- **THEN** the CLI entrypoint delegates to one internal clone orchestration path that performs fetch and detached checkout semantics through internal ops without `DmlOps.clone` diff --git a/openspec/changes/archive/2026-04-30-remove-dmlops-clone/tasks.md b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/tasks.md new file mode 100644 index 0000000..9c894f5 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-remove-dmlops-clone/tasks.md @@ -0,0 +1,17 @@ +## 1. Remove DmlOps clone surface + +- [x] 1.1 Locate and delete `DmlOps.clone` implementations in all forms (sync/async/wrapper variants) and remove related exports. +- [x] 1.2 Update internal callers to use the surviving clone orchestration entrypoint directly, with no compatibility shim. +- [x] 1.3 Remove clone-only helpers that become unreachable after `DmlOps.clone` removal. + +## 2. Rewire CLI clone routing + +- [x] 2.1 Update clone CLI command handling to invoke one supported internal operations entrypoint directly after argument parsing. +- [x] 2.2 Ensure clone branch/tag flows preserve existing fetch/checkout semantics while avoiding `DmlOps.clone`. +- [x] 2.3 Update CLI-facing result mapping and error propagation for the new route. + +## 3. Clean up and verify behavior + +- [x] 3.1 Remove or rewrite tests that target `DmlOps.clone`, keeping clone behavior coverage at CLI/operation boundaries. +- [x] 3.2 Run clone-related unit/integration tests and fix regressions from the routing change. 
+- [x] 3.3 Update affected docs/comments to remove references to `DmlOps.clone` and reflect direct operation routing. diff --git a/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/.openspec.yaml b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/.openspec.yaml new file mode 100644 index 0000000..12e66c2 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-30 diff --git a/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/design.md b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/design.md new file mode 100644 index 0000000..ac1e1ee --- /dev/null +++ b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/design.md @@ -0,0 +1,66 @@ +## Context + +The CLI currently acts as both transport layer and workflow coordinator in places, blending input/output concerns with domain decision-making. This creates duplicated branching, makes command behavior harder to validate in isolation, and increases the risk of interface regressions when internal logic evolves. The repository already has layered boundaries (CLI -> API/internal ops), so this change formalizes that boundary for maintainability and testability. + +## Goals / Non-Goals + +**Goals:** +- Ensure CLI command handlers only perform argument parsing, call into domain interfaces, and serialize results/errors. +- Move non-transport decision logic out of CLI modules into API/internal layers with explicit contracts. +- Keep user-visible CLI semantics and output shape stable unless a compatibility fix is explicitly required. +- Make behavior testable at the correct layer (domain behavior tested outside CLI; CLI tests focused on parsing and formatting). + +**Non-Goals:** +- Redesigning command names, flags, or broad UX flows. +- Rewriting underlying domain behavior unrelated to boundary extraction. +- Introducing a new CLI framework. 
+ +## Decisions + +### Decision: Define CLI as a thin interface boundary +The CLI will be treated as a transport adapter with three responsibilities: parse inputs, invoke one domain entrypoint, serialize outputs. + +Alternatives considered: +- Keep selective orchestration in CLI for convenience: rejected because boundary remains ambiguous and hard to enforce. +- Push all behavior into CLI-specific helper utilities: rejected because it only relocates, not resolves, layering concerns. + +### Decision: Move branching/workflow rules to API or internal ops based on ownership +If logic expresses user-level behavior contract, place it in public API modules; if it reflects transactional/domain primitives, place it in internal ops. + +Alternatives considered: +- Move all logic directly to internal ops: rejected because API-level semantics and ergonomics still need a stable home. + +### Decision: Standardize command result envelopes before formatting +Command handlers should consume structured domain results and apply consistent output serialization paths (success, validation failure, execution failure). + +Alternatives considered: +- Keep per-command ad hoc output shaping: rejected due to inconsistency and duplicated error translation. + +### Decision: Enforce boundary with tests and code review checks +Update tests so CLI tests assert parsing/serialization only, while behavior tests move to API/internal suites. Add review guidance to prevent reintroducing orchestration logic into CLI paths. + +Alternatives considered: +- Rely on convention without tests: rejected because drift is likely over time. + +## Risks / Trade-offs + +- [Risk] Extracting logic may accidentally change edge-case command behavior -> Mitigation: capture baseline behavior with regression tests before and after extraction. +- [Risk] Refactor can temporarily duplicate logic across layers -> Mitigation: perform iterative moves per command area with cleanup checkpoints. 
+- [Risk] Error mapping changes may alter exit codes/message text -> Mitigation: preserve and assert current externally visible error contract in CLI-focused tests. +- [Trade-off] More explicit interfaces between CLI and domain layers increase initial verbosity -> Mitigation: gain long-term clarity and lower maintenance overhead. + +## Migration Plan + +1. Inventory CLI commands and identify non-transport logic currently in handlers. +2. Define/confirm target domain entrypoints for each command area. +3. Extract one command area at a time, preserving existing output and exit code behavior. +4. Move/add tests to validate behavior at domain layers and keep CLI tests transport-focused. +5. Remove dead CLI branches/helpers once each area is migrated. + +Rollback strategy: +- Revert per-command extraction commits if contract regressions are discovered, then re-apply with stronger regression coverage. + +## Open Questions + +- Should a lightweight lint/check be added to flag disallowed imports or patterns in `src/daggerml/_cli/**`? +- Are there any intentionally CLI-only behaviors that should remain exceptions to the thin-boundary rule? diff --git a/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/proposal.md b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/proposal.md new file mode 100644 index 0000000..4100fe9 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/proposal.md @@ -0,0 +1,26 @@ +## Why + +The current CLI layer appears to include behavior beyond argument parsing and output serialization, which makes it harder to test, reason about, and evolve interface contracts safely. We need a clear boundary now to improve maintainability and keep business rules centralized in core modules. + +## What Changes + +- Refactor the CLI surface so command handlers only parse inputs, invoke domain APIs, and serialize outputs. 
+- Move decision-making and workflow logic currently in CLI command paths into appropriate internal/public API layers. +- Standardize CLI command result shaping so output formatting is consistent and transport-focused. +- Remove or simplify CLI-only branching that duplicates domain behavior. +- Preserve existing user-visible command semantics unless a compatibility adjustment is explicitly required. + +## Capabilities + +### New Capabilities +- `cli-thin-interface`: Define and enforce CLI responsibility boundaries for input parsing and output serialization only. + +### Modified Capabilities + + +## Impact + +- Affected code: `src/daggerml/_cli/**` and any modules currently called from CLI that will absorb moved logic. +- APIs: CLI command internals and invocation paths; no intentional public CLI UX breakage. +- Tests: CLI tests and potentially API/internal tests updated to assert shifted responsibility. +- Systems: Improves separation of concerns between interface and domain layers, reducing duplicate logic. diff --git a/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/specs/cli-thin-interface/spec.md b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/specs/cli-thin-interface/spec.md new file mode 100644 index 0000000..ca33e2f --- /dev/null +++ b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/specs/cli-thin-interface/spec.md @@ -0,0 +1,29 @@ +## ADDED Requirements + +### Requirement: CLI handlers are transport-only +The CLI command layer SHALL be limited to parsing command inputs, invoking domain interfaces, and serializing outputs, and SHALL NOT contain business workflow or domain decision logic. 
+ +#### Scenario: CLI parses and delegates +- **WHEN** a user invokes any CLI command +- **THEN** the handler parses flags/arguments, calls a domain entrypoint, and formats the returned result without domain branching in the CLI layer + +### Requirement: Domain logic resides outside CLI modules +Any behavior that determines domain outcomes (state transitions, merge/reconcile rules, execution sequencing, or validation beyond input shape/type checks) MUST execute in API/internal modules rather than `src/daggerml/_cli/**`. + +#### Scenario: Decision logic extraction +- **WHEN** a command path requires branching based on repository or execution state +- **THEN** the branching logic executes in a non-CLI module and CLI code only forwards parsed inputs and surfaces returned outcomes + +### Requirement: CLI output contract remains stable through refactor +Refactoring to enforce a thin CLI boundary MUST preserve existing user-visible command semantics, including success output structure and failure signaling, unless a change is explicitly documented as a compatibility update. + +#### Scenario: Refactor preserves command behavior +- **WHEN** CLI logic is moved into domain modules +- **THEN** command outputs and exit outcomes remain equivalent for existing supported invocations + +### Requirement: CLI tests focus on interface behavior +CLI-focused tests SHALL validate input parsing, delegation wiring, output serialization, and exit signaling, while domain behavior assertions SHALL be covered in non-CLI test suites. 
+ +#### Scenario: Test responsibility split +- **WHEN** adding or updating tests for a refactored command +- **THEN** CLI tests assert transport concerns only and domain behavior checks appear in API/internal tests diff --git a/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/tasks.md b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/tasks.md new file mode 100644 index 0000000..41db747 --- /dev/null +++ b/openspec/changes/archive/2026-04-30-simplify-cli-io-boundary/tasks.md @@ -0,0 +1,23 @@ +## 1. Baseline and boundary definition + +- [x] 1.1 Inventory `src/daggerml/_cli/**` command handlers and list locations where domain/workflow logic currently exists. +- [x] 1.2 Define per-command domain entrypoints (API or internal ops) that CLI handlers should delegate to. +- [x] 1.3 Capture baseline CLI behavior for critical commands (output shape, exit outcomes, key error cases) with regression tests. + +## 2. Extract CLI orchestration logic + +- [x] 2.1 Refactor command handlers to keep only input parsing, delegation, and output serialization. +- [x] 2.2 Move extracted branching/workflow decisions into appropriate API/internal modules with explicit interfaces. +- [x] 2.3 Remove CLI-local duplicated decision branches once equivalent domain logic paths are validated. + +## 3. Normalize output and error handling + +- [x] 3.1 Introduce or align structured command result envelopes used by CLI formatters. +- [x] 3.2 Ensure CLI preserves existing externally visible success output and failure signaling semantics. +- [x] 3.3 Add/update tests for consistent serialization and exit signaling across representative commands. + +## 4. Final verification and guardrails + +- [x] 4.1 Shift behavior-heavy assertions to API/internal tests and keep CLI tests focused on transport concerns. +- [x] 4.2 Add lightweight guardrails (review checklist or lint/test pattern) to prevent new business logic from being added in CLI modules. 
+- [x] 4.3 Run targeted test suites for CLI and touched domain layers, and fix any regressions before merge. diff --git a/openspec/changes/archive/2026-05-01-centralize-revision-uri/.openspec.yaml b/openspec/changes/archive/2026-05-01-centralize-revision-uri/.openspec.yaml new file mode 100644 index 0000000..ce9d1c6 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-centralize-revision-uri/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-01 diff --git a/openspec/changes/archive/2026-05-01-centralize-revision-uri/design.md b/openspec/changes/archive/2026-05-01-centralize-revision-uri/design.md new file mode 100644 index 0000000..af27ea8 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-centralize-revision-uri/design.md @@ -0,0 +1,70 @@ +## Context + +The codebase currently has two URI models and two parse/canonicalization implementations: + +- `parse_dml_project_uri` / `normalize_project_uri` in `_internal.config` +- `parse_dml_uri` / `canonical_dml_uri` in `_internal.ops.remote` + +Both encode the same grammar (`dml://owner/project#branch` or `@tag`) and many of the same validations. This creates avoidable duplication and policy drift. + +At the same time, operation constraints differ by context: + +- configuration should accept branch or tag +- branch push requires branch +- tag push requires tag + +Those operation constraints should be applied at call boundaries, not baked into the shared parser. + +## Goals / Non-Goals + +**Goals:** +- Establish one shared revision URI model and parse/stringify surface. +- Ensure canonical stringification always emits branch or tag form. +- Allow `remote.project` to carry either a branch or a tag. +- Keep mutation restrictions explicit in operation methods. + +**Non-Goals:** +- Redesigning commit revision expression grammar (`HEAD`, `~N`, `origin/main`). +- Changing remote ref namespace/layout. +- Changing branch/tag mutability semantics. 
+ +## Decisions + +- Single shared revision URI value type with XOR invariant. + - Decision: represent revision URI as `RevisionUri(owner, project, branch, tag)` with exactly one non-`None` among `branch`, `tag`. + - Rationale: type-level explicitness prevents ambiguous states and simplifies canonicalization. + +- Central parser returns fully realized revision selectors. + - Decision: parser validates URI structure/segments and invariant requirements, and resolves missing selector to a branch via provided default-branch input so parsed `RevisionUri` always has exactly one selector set. + - Rationale: parsing is the single realization boundary; downstream code receives a complete typed revision object. + +- Central stringifier is canonical and total for valid `RevisionUri` values. + - Decision: stringifier always emits canonical `dml://owner/project#branch` or `dml://owner/project@tag`. + - Rationale: one path for all generated URI text removes ad-hoc interpolation drift. + +- Canonicalize helper composes parse + stringify. + - Decision: provide explicit helper for canonical URI normalization. + - Rationale: most call sites want normalization without manual two-step calls. + +- Operation-level constraints remain where behavior differs. + - Decision: keep branch/tag requirements at operation boundaries (e.g., push branch/tag methods). + - Rationale: this preserves existing behavior contracts while enabling broader URI acceptance in configuration. + +## Migration Plan + +1. Add shared `RevisionUri` parse/stringify/canonicalize utilities. +2. Convert existing config and remote helper APIs into wrappers over shared utilities. +3. Migrate URI assembly call sites in `DmlOps` and commit tracking URI construction to shared stringifier. +4. Remove branch-only rejections in config resolution and project config loading. +5. Keep or strengthen explicit branch/tag assertions in mutating remote ops. +6. 
Update tests/spec expectations for tag-accepting project URIs and centralized helper behavior. + +## Risks / Trade-offs + +- [Backward compatibility surprises in config flows] -> Mitigation: preserve canonical string output and migrate with wrapper compatibility first. +- [Over-centralizing policy] -> Mitigation: keep parser policy-neutral about selector-type capabilities and enforce behavior-specific branch/tag rules at operation boundaries. +- [Test churn] -> Mitigation: sequence migration via wrappers to keep external behavior stable while internals consolidate. + +## Open Questions + +- Should `DmlProjectConfig` become revision-shaped (branch/tag) or remain branch-oriented with conversion behavior at load/save boundaries? diff --git a/openspec/changes/archive/2026-05-01-centralize-revision-uri/proposal.md b/openspec/changes/archive/2026-05-01-centralize-revision-uri/proposal.md new file mode 100644 index 0000000..3f31ab8 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-centralize-revision-uri/proposal.md @@ -0,0 +1,25 @@ +## Why + +Revision URI parsing and canonicalization are currently duplicated across configuration and remote ops layers with near-identical rules and slightly different behavior boundaries. That duplication increases drift risk, makes policy changes expensive (like allowing `remote.project` tags), and creates unnecessary coupling between caller intent and URI representation details. + +We want one canonical revision URI model and one canonical set of parse/stringify operations, with operation-specific constraints enforced at operation boundaries rather than inside shared parsing logic. + +## What Changes + +- Introduce one shared revision URI value type (owner, project, branch, tag) with invariant: exactly one of branch/tag is present. +- Introduce one shared parser for revision URIs that returns a fully realized selector (branch or tag), one shared canonical stringifier, and one canonicalize helper (`parse + stringify`). 
+- Replace duplicated URI parsing/canonicalization implementations in config and remote ops with wrappers/delegation to the shared implementation. +- Update project configuration semantics to allow tag-bearing `remote.project` values. +- Preserve branch-only mutation safety by keeping operation-level branch/tag requirements in mutating remote methods. + +## Capabilities + +### Modified Capabilities +- `shared-internal-configuration`: project URI normalization/parsing semantics move to centralized shared revision URI utilities and no longer reject tags at config resolution time. +- `remote-project-refs`: remote URI parsing and canonicalization use one centralized implementation while preserving mutable-branch/immutable-tag operation contracts. + +## Impact + +- Affected code: shared internal config URI helpers, remote ops URI helpers, DmlOps URI assembly sites, and commit revision tracking URI construction. +- Affected behavior: `remote.project` accepts branch or tag selectors; mutation restrictions remain enforced by branch/tag-specific operations. +- User-facing impact: canonical URI behavior stays stable while enabling tag-based project URI configuration and reducing inconsistent URI handling edge cases. diff --git a/openspec/changes/archive/2026-05-01-centralize-revision-uri/specs/remote-project-refs/spec.md b/openspec/changes/archive/2026-05-01-centralize-revision-uri/specs/remote-project-refs/spec.md new file mode 100644 index 0000000..30349fc --- /dev/null +++ b/openspec/changes/archive/2026-05-01-centralize-revision-uri/specs/remote-project-refs/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: Remote operations parse DML URIs +The system SHALL parse and canonicalize DML revision URIs through one centralized shared revision URI parser/stringifier boundary before deriving remote project ref paths. 
+ +#### Scenario: Push parses branch URI through shared parser +- **WHEN** push targets canonical URI `dml://alice/demo#main` +- **THEN** remote operations derive `refs/projects/alice/demo/heads/main.json` from the shared parsed revision object + +#### Scenario: Fetch parses tag URI through shared parser +- **WHEN** fetch targets canonical URI `dml://alice/demo@v1.0` +- **THEN** remote operations derive `refs/projects/alice/demo/tags/v1.0.json` from the shared parsed revision object + +#### Scenario: Branch/tag capability checks remain operation-specific +- **WHEN** a mutation operation targets the wrong selector type (branch op with tag URI, or tag op with branch URI) +- **THEN** the operation fails at method boundary capability checks even though URI parsing/canonicalization succeeds diff --git a/openspec/changes/archive/2026-05-01-centralize-revision-uri/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-05-01-centralize-revision-uri/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..acceb42 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-centralize-revision-uri/specs/shared-internal-configuration/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: Project URI is normalized and exposes helper accessors +The system SHALL normalize and canonicalize `remote.project` through shared revision URI utilities. Resolved project configuration MAY target a branch or a tag. The resolved config object SHALL continue to expose helper accessors for the effective project selector. 
+ +#### Scenario: Missing selector parses as default branch +- **WHEN** `remote.project` is provided without a branch or tag in `project/runtime` scope +- **THEN** shared revision URI parsing resolves it to a fully realized branch selector using the effective default branch + +#### Scenario: Tag URI is accepted for project context +- **WHEN** `remote.project` is provided with a tag selector +- **THEN** project configuration resolution succeeds and preserves canonical tag form + +#### Scenario: Project helper accessors derive from canonical URI +- **WHEN** resolved configuration includes `remote.project` +- **THEN** helper accessors derive selector values from canonical parsed URI rather than standalone duplicated parsing logic diff --git a/openspec/changes/archive/2026-05-01-centralize-revision-uri/tasks.md b/openspec/changes/archive/2026-05-01-centralize-revision-uri/tasks.md new file mode 100644 index 0000000..ab517e9 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-centralize-revision-uri/tasks.md @@ -0,0 +1,25 @@ +## 1. Establish shared revision URI boundary + +- [x] 1.1 Add shared `RevisionUri` value type with XOR invariant (`branch` xor `tag`). +- [x] 1.2 Add shared parse utility for `dml://owner/project#branch|@tag` that returns fully realized `RevisionUri` (injecting default branch when selector omitted). +- [x] 1.3 Add shared stringify utility that emits canonical branch/tag URI form. +- [x] 1.4 Add shared canonicalize helper (`parse + stringify`) and apply 64-byte canonical URI validation in one place. + +## 2. Migrate existing helpers and call sites + +- [x] 2.1 Convert `_internal.config` URI helpers to delegate to shared revision URI utilities. +- [x] 2.2 Convert `RemoteOps` URI helpers to delegate to shared revision URI utilities. +- [x] 2.3 Replace ad-hoc URI interpolation in `DmlOps` project remote URI construction with shared stringify. 
+- [x] 2.4 Replace commit tracking URI interpolation with shared stringify where DML URI tracking heads are created/looked up. + +## 3. Align policy semantics (tags allowed in project URI) + +- [x] 3.1 Remove config-layer branch-only rejection for `remote.project`. +- [x] 3.2 Keep operation-level branch/tag capability checks (e.g., push-branch requires branch; push-tag requires tag). +- [x] 3.3 Review and update `DmlProjectConfig` behavior to support tag-bearing project URI usage without breaking branch mutation flows. + +## 4. Validate behavior and documentation + +- [x] 4.1 Update/add tests for centralized parse/stringify/canonicalize behavior and wrapper compatibility. +- [x] 4.2 Update/add tests proving `remote.project` accepts tags while mutation constraints remain enforced by operation methods. +- [x] 4.3 Update relevant OpenSpec spec deltas (`shared-internal-configuration`, `remote-project-refs`) to reflect centralized URI handling and tag-allowed project URI semantics. diff --git a/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/design.md b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/design.md new file mode 100644 index 0000000..ae94e75 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/design.md @@ -0,0 +1,122 @@ +## Context + +The repository currently has broad test coverage but inconsistent structure: many tests are organized by module history instead of contract boundary, lifecycle assertions are often split across multiple test functions, and integration-heavy tests are not consistently marked for selective execution. This makes the suite harder to navigate, raises maintenance cost, and weakens traceability from documented requirements to test intent. 
+ +The change introduces a full-suite migration to a contract matrix approach: + +- tests organized around contract surfaces, +- canonical contract IDs embedded directly in test naming and parameterized case IDs, +- lifecycle stage coverage expressed via parameterization, +- integration tests consistently marked `slow`, +- legacy superseded tests removed. + +Pressure-test findings from the current repository state: + +- There are currently no tests marked `@pytest.mark.slow`. +- CI currently runs full unfiltered test passes (`pytest .`) in both standard and sanitizer jobs. +- Autouse fixture coupling is present in top-level and internal test conftest modules, which can blur contract vs integration boundaries during migration. +- Lifecycle helper logic (for example argv manifest setup and poll-until-terminal loops) is duplicated across multiple contrib suites and should be consolidated as part of lifecycle matrix migration. + +## Goals / Non-Goals + +**Goals:** +- Make each maintained test target a single documented contract or invariant. +- Improve traceability from docs/specs to test failures using stable canonical IDs. +- Reduce lifecycle test duplication via parameterized stage matrices. +- Make quick local feedback dependable with `-m "not slow"`. +- Complete migration of maintained tests to the new setup and eliminate legacy duplicates. + +**Non-Goals:** +- Changing product runtime behavior or public API semantics. +- Introducing a centralized contract-ID registry module. +- Rewriting every helper fixture unless needed for speed/clarity boundaries. + +## Decisions + +1. Test taxonomy is contract-first with explicit fast/integration split. + - Decision: Use `tests/contracts/` for fast invariant checks and `tests/integration/` for multi-component/infrastructure tests. + - Rationale: Improves discoverability and makes speed characteristics obvious from location. + - Alternative considered: Keep current folders and annotate intent with comments/markers only. 
Rejected because structure would remain ambiguous and drift-prone. + +2. Canonical IDs are literal strings at point-of-use. + - Decision: Put IDs directly in test function names and parameterized `id=` strings (for example `EXEC-LC-003:resume-uses-launch-state`). + - Rationale: Preserves readability and avoids indirection overhead while still enabling stable traceability. + - Alternative considered: Central CONTRACT map or registry module. Rejected for now due to maintenance overhead and limited immediate benefit. + +3. Lifecycle assertions are represented as parameterized stage matrices. + - Decision: For lifecycle-heavy surfaces, collapse near-duplicate tests into one parameterized test per contract family. + - Rationale: Reduces duplication and makes missing lifecycle stages visible as absent cases. + - Alternative considered: Keep separate functions per stage. Rejected because behavior drift across stages is harder to detect. + +4. Integration selection is marker-driven and mandatory. + - Decision: Mark integration tests `@pytest.mark.slow` and enforce this in migrated suites. + - Rationale: Aligns with existing pytest marker configuration and contributor workflow for fast iteration. + - Alternative considered: Path-only selection without marker discipline. Rejected because existing paths include mixed-speed tests during migration. + +5. Migration is replacement, not long-term parallel tracks. + - Decision: Remove superseded legacy tests as contract-matrix equivalents land. + - Rationale: Avoids dual maintenance and contradictory assertions across old/new structures. + - Alternative considered: Keep old tests indefinitely for safety. Rejected because it increases noise and slows suite evolution. + +## Risks / Trade-offs + +- [Risk] Migration churn causes temporary CI instability due to file moves/renames. + → Mitigation: Migrate in bounded batches by subsystem with parity checks before deletion. 
+ +- [Risk] Inconsistent canonical ID formatting across contributors. + → Mitigation: Define and enforce formatting in a single taxonomy doc and apply review checks. + +- [Risk] Over-parameterized tests become hard to read. + → Mitigation: Keep one contract family per parameterized test and use explicit readable case IDs. + +- [Risk] Some tests are hard to classify as contract vs integration. + → Mitigation: Default uncertain cases to integration + `slow`, then optimize toward contract tests when isolation is straightforward. + +- [Risk] Fixture refactors may reveal hidden integration coupling in historically fast tests. + → Mitigation: Introduce explicit fixture scopes and remove implicit autouse integration setup where it blurs boundaries. + +## Migration Plan + +1. Publish taxonomy and naming conventions in docs. +2. Create initial contract/integration directory structure. +3. Execute migration in bounded batches with parity checkpoints: + - Batch 1 (low risk): contract-focused suites with minimal infrastructure coupling. + - Batch 2 (medium): lifecycle-heavy local runtime and funkify suites. + - Batch 3 (medium-high): execution-state and internal roundtrip integration-heavy suites. + - Batch 4 (high): ssh-backed and remaining infrastructure-heavy integration suites. +4. Apply `@pytest.mark.slow` to integration suites during migration. +5. Remove superseded legacy tests only after parity evidence is captured in the migration ledger. +6. Update contributor guidance and CI invocation expectations to match marker usage. 
+ +### Initial Migration Ledger (Batch 1) + +Batch 1 target suites and planned contract mapping: + +- `tests/contrib/test_executor_base.py` -> `tests/contracts/contrib/executor/test_executor_base_handle.py` + - `EXB-HDL-001`: start path when `state=None` + - `EXB-HDL-002`: poll path when state exists + - `EXB-HDL-003`: terminal start result passthrough + - `EXB-HDL-004`: mixed state invocations route correctly +- `tests/contrib/test_ssh_executor.py` -> + - `tests/contracts/contrib/executor/test_ssh_resolve_runnable.py` + - `SSH-RES-001`, `SSH-RES-002` + - `tests/contracts/contrib/executor/test_ssh_handle.py` + - `SSH-HDL-001` through `SSH-HDL-005` +- `tests/test_default_runtime.py` -> `tests/contracts/runtime/test_default_runtime_status.py` + - `DRT-STS-001` through `DRT-STS-004` + +Batch 1 parity gate before legacy removal: + +- targeted migrated suites pass, +- `pytest -m "not slow"` pass, +- full `pytest` pass, +- contract mapping and removal decision recorded in ledger. + +Rollback strategy: +- If migration introduces instability, pause after a completed subsystem batch; retain migrated layout and restore confidence by fixing tests, rather than restoring legacy duplicates. + +## Open Questions + +- Should we introduce a custom pytest marker (for example `contract`) in a follow-up to enable direct contract-only selection? +- Do we want an automated check that every `tests/integration/**` test is marked `slow`? +- Should canonical contract IDs be validated via lint rule in a later iteration? 
diff --git a/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/migration-ledger.md b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/migration-ledger.md new file mode 100644 index 0000000..2c4f37c --- /dev/null +++ b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/migration-ledger.md @@ -0,0 +1,107 @@ +# Test Migration Ledger: Contract Matrix + +## Change + +- OpenSpec change: `refactor-test-contract-matrix` +- Last updated: 2026-05-01 + +## Status Legend + +- `planned` | `in_progress` | `blocked` | `done` + +## Batch Plan + +| Batch | Scope | Primary Files | Risk | Exit Criteria | Status | +|---|---|---|---|---|---| +| 1 | Low-risk contract suites | `tests/contrib/test_executor_base.py`, `tests/contrib/test_ssh_executor.py`, `tests/test_default_runtime.py` | low | Targeted migrated suites pass, `pytest -m "not slow"` pass, full `pytest` pass, mapping recorded | done | +| 2 | Lifecycle-heavy local runtime suites | `tests/contrib/test_local_runtime.py`, `tests/contrib/test_funkify.py` | medium | Stage-matrix parameterization complete, parity checks pass | done | +| 3 | Execution-state and internal integration-heavy suites | `tests/test_exec_state.py`, `tests/_internal/test_integration_roundtrip.py` | medium-high | Contract/integration split complete, parity checks pass | done | +| 4 | Infrastructure-heavy integration suites | `tests/contrib/test_ssh_integration.py` and remaining integration suites | high | Slow-marker compliance complete, parity checks pass | done | + +## 1) Contract Coverage Mapping (Initial Batch 1) + +| Contract ID | Contract Summary | Old Test Location(s) | New Test Location | Test Type | Slow? 
| Lifecycle Stages Covered | Status | Notes | +|---|---|---|---|---|---|---|---|---| +| EXB-HDL-001 | executor handle calls start when state is null | `tests/contrib/test_executor_base.py` | `tests/contracts/contrib/executor/test_executor_base_handle.py` | contract | no | kickoff | done | Implemented in lifecycle stage matrix test with canonical case ID. | +| EXB-HDL-002 | executor handle calls poll when state exists | `tests/contrib/test_executor_base.py` | `tests/contracts/contrib/executor/test_executor_base_handle.py` | contract | no | resume/poll | done | Implemented in lifecycle stage matrix test with canonical case ID. | +| EXB-HDL-003 | terminal start result is returned directly | `tests/contrib/test_executor_base.py` | `tests/contracts/contrib/executor/test_executor_base_handle.py` | contract | no | kickoff/terminal | done | Dedicated terminal passthrough assertion migrated. | +| EXB-HDL-004 | mixed state invocations route correctly | `tests/contrib/test_executor_base.py` | `tests/contracts/contrib/executor/test_executor_base_handle.py` | contract | no | kickoff + resume/poll | done | Mixed kickoff/resume assertions preserved. | +| SSH-RES-001 | local adapter resolves ssh runnable shape | `tests/contrib/test_ssh_executor.py` | `tests/contracts/contrib/executor/test_ssh_resolve_runnable.py` | contract | no | resolve | done | Canonical ID in test function name. | +| SSH-RES-002 | ssh resolve rejects invalid inputs | `tests/contrib/test_ssh_executor.py` | `tests/contracts/contrib/executor/test_ssh_resolve_runnable.py` | contract | no | resolve | done | Parametrized case IDs carry canonical contract ID. | +| SSH-HDL-001 | ssh handle forwards envelope to transport | `tests/contrib/test_ssh_executor.py` | `tests/contracts/contrib/executor/test_ssh_handle.py` | contract | no | kickoff | done | Included in stage matrix with canonical case ID. 
| +| SSH-HDL-002 | ssh transport nonzero exits map to failed | `tests/contrib/test_ssh_executor.py` | `tests/contracts/contrib/executor/test_ssh_handle.py` | contract | no | terminal-failed | done | Included in stage matrix with canonical case ID. | +| SSH-HDL-003 | running child result passes through | `tests/contrib/test_ssh_executor.py` | `tests/contracts/contrib/executor/test_ssh_handle.py` | contract | no | resume/poll | done | Included in stage matrix with canonical case ID. | +| SSH-HDL-004 | child failed result is projected unchanged | `tests/contrib/test_ssh_executor.py` | `tests/contracts/contrib/executor/test_ssh_handle.py` | contract | no | terminal-failed | done | Included in stage matrix with canonical case ID. | +| SSH-HDL-005 | runtime state is forwarded to child payload | `tests/contrib/test_ssh_executor.py` | `tests/contracts/contrib/executor/test_ssh_handle.py` | contract | no | resume/poll | done | Dedicated runtime-state forwarding test migrated. | +| DRT-STS-001 | status reports implicit default creation source | `tests/test_default_runtime.py` | `tests/contracts/runtime/test_default_runtime_status.py` | contract | no | runtime-init/status | done | Canonical ID in test function name. | +| DRT-STS-002 | process default is cached | `tests/test_default_runtime.py` | `tests/contracts/runtime/test_default_runtime_status.py` | contract | no | steady-state | done | Canonical ID in test function name. | +| DRT-STS-003 | scoped default temporarily overrides process default | `tests/test_default_runtime.py` | `tests/contracts/runtime/test_default_runtime_status.py` | contract | no | scoped lifecycle | done | Canonical ID in test function name. | +| DRT-STS-004 | top-level new/load delegates to default runtime | `tests/test_default_runtime.py` | `tests/contracts/runtime/test_default_runtime_status.py` | contract | no | operation dispatch | done | Canonical ID in test function name. 
| + +## 2) Legacy Removal Plan (Initial Batch 1) + +| Old File | Replacement New File(s) | Parity Evidence Required | Removal PR/Commit | Removed? | Notes | +|---|---|---|---|---|---| +| `tests/contrib/test_executor_base.py` | `tests/contracts/contrib/executor/test_executor_base_handle.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Legacy file removed after parity runs. | +| `tests/contrib/test_ssh_executor.py` | `tests/contracts/contrib/executor/test_ssh_resolve_runnable.py`, `tests/contracts/contrib/executor/test_ssh_handle.py` | targeted migrated suites pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Legacy file removed after parity runs. | +| `tests/test_default_runtime.py` | `tests/contracts/runtime/test_default_runtime_status.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Legacy file removed after parity runs. | +| `tests/contrib/test_local_runtime.py` | `tests/integration/contrib/test_local_runtime_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/contrib/test_funkify.py` | `tests/integration/contrib/test_funkify_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/test_exec_state.py` | `tests/integration/runtime/test_exec_state_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. 
| +| `tests/_internal/test_integration_roundtrip.py` | `tests/integration/internal/test_roundtrip_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy with local fixture compatibility shim. | +| `tests/contrib/test_ssh_integration.py` | `tests/integration/contrib/test_ssh_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/contrib/test_s3_store.py` | `tests/integration/contrib/test_s3_store_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/contrib/test_supervisor.py` | `tests/integration/contrib/test_supervisor_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/contrib/test_funks.py` | `tests/integration/contrib/test_funks_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/_internal/ops/test_cache.py` | `tests/integration/internal/ops/test_cache_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/_internal/ops/test_remote.py` | `tests/integration/internal/ops/test_remote_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. 
| +| `tests/_internal/ops/test_dml_project_workflows.py` (integration subset) | `tests/integration/internal/ops/test_dml_project_workflows_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Extracted integration tests; contract-oriented tests remain in legacy file. | +| `tests/_internal/ops/test_commit.py` | `tests/integration/internal/ops/test_commit_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/_internal/ops/test_head.py` | `tests/integration/internal/ops/test_head_integration.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Marked slow and moved to integration taxonomy. | +| `tests/_internal/cli/test_init.py` (integration class) | `tests/_internal/cli/test_init.py` | targeted migrated suite pass + `pytest -m "not slow"` pass + full `pytest` pass + mapping complete | local working tree | yes | Integration class `TestInitCLIIntegration` marked slow in place. | + +## 3) Parity Checklist (Per migrated family) + +- [x] Canonical IDs are direct literals in test names/parameterized case IDs. +- [x] Lifecycle coverage is parameterized where the contract family spans stages. +- [x] Targeted migrated suites pass. +- [x] `pytest -m "not slow"` passes. +- [x] Full `pytest` passes. +- [x] Legacy tests removed only after parity evidence is recorded. + +## 4) Decision Log + +- 2026-04-30: Canonical contract IDs are direct literal strings (no shared ID indirection). +- 2026-04-30: Migration policy is full replacement; superseded legacy tests are removed after parity confirmation. +- 2026-04-30: Integration tests are marked `@pytest.mark.slow`. 
+- 2026-05-01: Batch 1-4 migration executed in this change; contract suites moved to `tests/contracts/`, integration-heavy suites moved to `tests/integration/` with `slow` marker. + +## 5) Parity Evidence Log (Command-Level) + +- 2026-05-01: Targeted migrated suites + - Command: `uv run pytest tests/contracts/contrib/executor/test_executor_base_handle.py tests/contracts/contrib/executor/test_ssh_resolve_runnable.py tests/contracts/contrib/executor/test_ssh_handle.py tests/contracts/runtime/test_default_runtime_status.py` + - Result: `17 passed` +- 2026-05-01: Targeted migrated integration suites + - Command: `uv run pytest tests/integration/contrib/test_local_runtime_integration.py tests/integration/contrib/test_funkify_integration.py tests/integration/contrib/test_s3_store_integration.py tests/integration/contrib/test_supervisor_integration.py tests/integration/contrib/test_funks_integration.py tests/integration/internal/ops/test_cache_integration.py tests/integration/internal/ops/test_remote_integration.py tests/integration/internal/ops/test_dml_project_workflows_integration.py` + - Result: `154 passed, 1 skipped` +- 2026-05-01: Targeted omitted migrated suites + - Command: `uv run pytest tests/integration/contrib/test_ssh_integration.py tests/integration/runtime/test_exec_state_integration.py tests/integration/internal/test_roundtrip_integration.py` + - Result: `34 passed` +- 2026-05-01: Targeted lifecycle matrix updates + - Command: `uv run pytest tests/integration/contrib/test_funkify_integration.py tests/integration/internal/ops/test_dml_project_workflows_integration.py` + - Result: `14 passed` +- 2026-05-01: Targeted remaining integration migration + - Command: `uv run pytest tests/integration/internal/ops/test_commit_integration.py tests/integration/internal/ops/test_head_integration.py tests/_internal/cli/test_init.py::TestInitCLIIntegration` + - Result: `19 passed` +- 2026-05-01: Fast-path parity (latest) + - Command: `uv run pytest -m "not slow"` + - 
Result: `498 passed, 209 deselected` +- 2026-05-01: Full parity (latest) + - Command: `uv run pytest` + - Result: `706 passed, 1 skipped` +- 2026-05-01: Fast-path parity + - Command: `uv run pytest -m "not slow"` + - Result: `517 passed, 189 deselected` +- 2026-05-01: Full parity + - Command: `uv run pytest` + - Result: `705 passed, 1 skipped` diff --git a/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/proposal.md b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/proposal.md new file mode 100644 index 0000000..7f22c68 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/proposal.md @@ -0,0 +1,25 @@ +## Why + +The current test suite mixes concerns, duplicates lifecycle assertions across many files, and does not consistently distinguish fast contract checks from slower integration behavior. We need a contract-driven structure so each test maps to one documented invariant, lifecycle coverage is concise and systematic, and fast local feedback is reliable. + +## What Changes + +- Reorganize tests into a contract-first taxonomy with dedicated `tests/contracts/` and `tests/integration/` areas. +- Require canonical contract IDs in test names and parameterized case IDs using direct literal strings (no registry indirection). +- Consolidate lifecycle assertions into parameterized tests that cover each lifecycle stage as explicit cases. +- Require `@pytest.mark.slow` for integration tests and other infrastructure-heavy tests so `pytest -m "not slow"` is a dependable quick path. +- Migrate all maintained tests to the new structure and remove superseded legacy tests once parity is confirmed. + +## Capabilities + +### New Capabilities +- `test-contract-matrix`: Defines repository test taxonomy, canonical test ID conventions, lifecycle parameterization rules, slow-marker policy, and full migration/removal expectations for legacy tests. + +### Modified Capabilities +- None. 
+ +## Impact + +- Affected code: test suite layout and naming across `tests/**`, shared test fixtures where needed to separate fast and integration concerns, and contributor-facing test documentation. +- APIs: no runtime or public API behavior change. +- Dependencies/systems: no new runtime dependency; CI and local test invocation patterns rely more explicitly on marker-based selection. diff --git a/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/specs/test-contract-matrix/spec.md b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/specs/test-contract-matrix/spec.md new file mode 100644 index 0000000..3432135 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/specs/test-contract-matrix/spec.md @@ -0,0 +1,71 @@ +## ADDED Requirements + +### Requirement: Contract-first test taxonomy +The repository SHALL organize maintained tests by contract intent with distinct locations for fast invariant checks and integration behavior. + +#### Scenario: Fast contract tests live under contracts taxonomy +- **WHEN** a test verifies a documented contract or invariant in isolation +- **THEN** it is placed under `tests/contracts/` + +#### Scenario: Integration tests live under integration taxonomy +- **WHEN** a test exercises multi-component behavior, external processes, remote roundtrips, or runtime orchestration +- **THEN** it is placed under `tests/integration/` + +### Requirement: Canonical contract IDs are embedded directly in test identifiers +Each maintained contract-focused test SHALL include a canonical contract ID expressed as a direct literal string in test naming surfaces. 
+ +#### Scenario: Parameterized lifecycle case includes canonical ID +- **WHEN** a test case is defined in `pytest.mark.parametrize` +- **THEN** the case `id=` string includes the canonical contract ID followed by a human-readable case label + +#### Scenario: Canonical IDs avoid indirection +- **WHEN** a test references a canonical contract ID +- **THEN** the ID is specified directly in the test or parameterized case definition and does not require indirection through a shared ID registry + +### Requirement: Lifecycle coverage uses parameterized stage matrices +Lifecycle-oriented contracts SHALL be tested with parameterized cases that explicitly represent each lifecycle stage. + +#### Scenario: Lifecycle stages are represented as explicit parameterized cases +- **WHEN** a contract family spans kickoff, resume/poll, and terminal behavior +- **THEN** one parameterized test defines stage-specific cases with distinct IDs and assertions for each stage + +#### Scenario: Stage-specific failures identify contract and stage +- **WHEN** a lifecycle parameterized case fails +- **THEN** the failure node identifier includes both the canonical contract ID and the stage label + +### Requirement: Integration tests are marked slow +Integration tests SHALL be marked `@pytest.mark.slow` so they can be excluded from quick local runs.
+ +#### Scenario: Integration test carries slow marker +- **WHEN** a test resides in the integration taxonomy or otherwise requires integration-level runtime behavior +- **THEN** the test is marked `@pytest.mark.slow` + +#### Scenario: Fast test selection excludes integration tests +- **WHEN** contributors run `pytest -m "not slow"` +- **THEN** tests marked `slow` are excluded and the remaining suite represents the fast-path contract checks + +### Requirement: Legacy test suite is fully migrated and superseded tests are removed +The repository SHALL complete migration of maintained tests to the contract matrix setup and SHALL remove superseded legacy tests to avoid duplicate maintenance. + +#### Scenario: Superseded legacy tests are removed after parity +- **WHEN** a legacy test's contract coverage is represented by migrated contract-matrix tests +- **THEN** the legacy test is removed from maintained test paths + +#### Scenario: End state contains only maintained tests aligned to taxonomy +- **WHEN** migration is complete +- **THEN** maintained tests conform to taxonomy, canonical ID, lifecycle parameterization, and slow-marker requirements defined in this specification + +### Requirement: Migration ledger governs parity and removal +The repository SHALL track migration progress in a ledger that maps canonical contract IDs from legacy tests to migrated tests and records parity evidence before legacy removal. 
+ +#### Scenario: Batch plan records concrete suite order and risk +- **WHEN** migration planning is established +- **THEN** the ledger records bounded batch order with risk levels and exit criteria for each batch + +#### Scenario: Contract mapping is explicit for each migrated suite +- **WHEN** a suite is selected for migration +- **THEN** the ledger records canonical contract IDs and old/new test file mappings for that suite + +#### Scenario: Legacy test removal requires parity evidence +- **WHEN** a legacy suite is proposed for removal +- **THEN** the ledger includes passing evidence for targeted migrated suites, `pytest -m "not slow"`, and full `pytest` prior to removal diff --git a/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/tasks.md b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/tasks.md new file mode 100644 index 0000000..a0af50f --- /dev/null +++ b/openspec/changes/archive/2026-05-01-refactor-test-contract-matrix/tasks.md @@ -0,0 +1,27 @@ +## 1. Taxonomy and governance baseline + +- [x] 1.1 Add and publish repository test taxonomy guidance covering directories, naming, canonical IDs, lifecycle parameterization, and slow-marker policy. +- [x] 1.2 Add OpenSpec requirement artifacts for the new test-contract-matrix capability. +- [x] 1.3 Create a migration ledger artifact that maps contract IDs, old/new file locations, parity evidence, and legacy removal state. + +## 2. Contract test structure and naming migration + +- [x] 2.1 Create `tests/contracts/` and `tests/integration/` structure and migrate execution/runtime contract suites into the new locations. +- [x] 2.2 Update migrated contract tests to include canonical contract IDs directly in function names and parameterized case IDs. +- [x] 2.3 Refactor lifecycle-heavy suites into parameterized stage-matrix tests that cover kickoff, resume/poll, and terminal outcomes. 
+- [x] 2.4 Execute Batch 1 migration for `test_executor_base`, `test_ssh_executor`, and `test_default_runtime` using the initial ledger mappings (`EXB-HDL-*`, `SSH-RES-*`, `SSH-HDL-*`, `DRT-STS-*`). + +## 3. Integration classification and marker enforcement + +- [x] 3.1 Mark all migrated integration suites with `@pytest.mark.slow`, including process/remote/polling-heavy coverage. +- [x] 3.2 Ensure quick-run workflow (`pytest -m "not slow"`) remains valid for fast-path contract checks. +- [x] 3.3 Align contributor and CI guidance with marker-based selection policy while preserving full-suite CI coverage. + +## 4. Full-suite migration and legacy removal + +- [x] 4.1 Execute Batch 2 migration for lifecycle-heavy local runtime suites (`test_local_runtime`, `test_funkify`) with stage-matrix coverage. +- [x] 4.2 Execute Batch 3 migration for execution-state and internal integration-heavy suites (`test_exec_state`, `_internal/test_integration_roundtrip`). +- [x] 4.3 Execute Batch 4 migration for infrastructure-heavy integration suites (including `test_ssh_integration`) and remaining integration coverage. +- [x] 4.4 Maintain batch-by-batch parity evidence (targeted suites, `pytest -m "not slow"`, full `pytest`) in the migration ledger before removing each legacy file. +- [x] 4.5 Remove superseded legacy tests after parity validation to avoid duplicate maintenance. +- [x] 4.6 Run full test suite verification and resolve migration regressions before closing the change. 
diff --git a/openspec/changes/archive/2026-05-01-trim-redundant-tests/.openspec.yaml b/openspec/changes/archive/2026-05-01-trim-redundant-tests/.openspec.yaml new file mode 100644 index 0000000..ce9d1c6 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-trim-redundant-tests/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-01 diff --git a/openspec/changes/archive/2026-05-01-trim-redundant-tests/design.md b/openspec/changes/archive/2026-05-01-trim-redundant-tests/design.md new file mode 100644 index 0000000..0e63824 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-trim-redundant-tests/design.md @@ -0,0 +1,74 @@ +## Context + +The internal contract test tree currently mixes three concerns: command wiring, workflow invariants, and parser grammar validation. Multiple files repeat parser-smoke assertions and revision/URI form checks, especially in CLI setup tests and git-like workflow contracts. This redundancy increases churn and broadens failure blast radius when parsing changes. + +The repository's testing taxonomy requires contract-first coverage, lifecycle parameterization, and removal of superseded tests once parity is confirmed. The change must preserve behavior coverage while reducing duplication and making ownership of parsing contracts explicit. + +## Goals / Non-Goals + +**Goals:** +- Centralize revision/ref/URI parsing behavior into one parameterized contract matrix with canonical IDs. +- Prune redundant parser smoke tests that do not add unique invariants. +- Reclassify external-process orchestration tests as `slow` where taxonomy and runtime behavior indicate integration-level execution. +- Reduce fast-path runtime by collapsing duplicate expensive adapter-path tests into one parameterized contract matrix per behavior family. +- Keep workflow tests focused on operational invariants and delegation boundaries. +- Maintain fast contract-suite ergonomics and traceable migration decisions. 
+ +**Non-Goals:** +- Changing runtime parsing behavior in production code. +- Reorganizing integration test layout or changing the slow-marker policy itself (this change only applies the existing policy to additional tests). +- Broadly renaming unrelated tests for style-only reasons. + +## Decisions + +1. **Introduce a single parsing contract owner suite** + - Create one contract-focused suite for revision/ref/URI parsing forms and errors. + - Use parameterized case matrices to encode form variants and failure boundaries. + - Rationale: this reduces duplicate assertions and aligns with lifecycle/matrix guidance. + +2. **Treat parser-smoke setup tests as removable when covered by specific parser arg tests** + - Delete `test_parser_creation` tests in files where per-subcommand arg tests already assert equivalent parser wiring. + - Rationale: avoid duplicate maintenance and preserve high-signal tests. + +3. **Move parsing assertions out of workflow-oriented git-like contract tests** + - Relocate revision classification and URI canonicalization checks from workflow files into the parsing matrix. + - Keep state-transition and delegation checks in workflow files. + - Rationale: separates grammar contracts from behavior contracts and narrows regression scope. + +4. **Retain user-visible CLI behavior checks even if lightweight** + - Preserve tests asserting output format/newline behavior and key top-level help sentinel coverage. + - Rationale: these are contract-relevant UX boundaries, not parser duplication. + +5. **Apply slow-marker policy to external-process execution paths** + - Mark tests that require adapter subprocess execution, polling loops, or runtime orchestration as `slow`. + - Keep pure in-memory contract checks in the fast path. + - Rationale: aligns test selection semantics with taxonomy and reduces non-slow wall time. + +6. **Collapse expensive adapter path duplicates into one matrix per contract family** + - Replace near-duplicate adapter execution tests with parameterized cases that preserve contract IDs and stage labels.
+ - Keep one high-signal representative per unique behavior boundary. + - Rationale: maintain contract parity while cutting repeated setup/execution overhead. + +## Risks / Trade-offs + +- **Risk:** Over-pruning may remove subtle edge-case coverage hidden in mixed workflow tests. -> **Mitigation:** move first, then delete only after parity matrix includes those cases. +- **Risk:** Centralized parsing matrix can become too broad and hard to read. -> **Mitigation:** group parameterized cases by surface (`parse_ref`, URI canonicalization, revision resolution) with clear IDs. +- **Risk:** Case-ID traceability may regress during migration. -> **Mitigation:** use direct canonical contract IDs in parameterized `id=` labels. +- **Trade-off:** Fewer local parser smoke tests means less immediate locality in some CLI files. -> **Mitigation:** keep command-level wiring tests and document parsing ownership in test naming. +- **Risk:** Moving tests to `slow` may hide regressions in default local runs. -> **Mitigation:** retain at least one fast representative per contract family and enforce full CI coverage with `slow` included. +- **Risk:** Matrix collapsing can accidentally drop edge variants. -> **Mitigation:** build explicit case inventory before deletion and verify parity from old-to-new mapping. + +## Migration Plan + +1. Add the new parsing contract matrix suite with all migrated parsing scenarios. +2. Move/port parsing assertions from CLI/base, GC helper parsing, and git-like workflow contract tests. +3. Identify external-process adapter/runtime orchestration tests in fast path and mark qualifying tests as `slow`. +4. Collapse duplicate expensive adapter-path tests into parameterized matrices with canonical IDs. +5. Remove redundant parser-smoke tests with demonstrated parity. +6. Run fast contract suite and targeted integration checks to confirm no behavioral coverage loss and improved non-slow runtime. 
+ +## Open Questions + +- Should GC `parse_heads` remain partially local as a command contract, or be fully centralized with other parsing checks? +- Should lightweight parser smoke tests in status/config/contrib be removed now or in a follow-up cleanup once matrix migration lands? +- Which adapter-path tests must remain fast representatives versus being reclassified `slow` to satisfy both contract confidence and local-loop performance? diff --git a/openspec/changes/archive/2026-05-01-trim-redundant-tests/proposal.md b/openspec/changes/archive/2026-05-01-trim-redundant-tests/proposal.md new file mode 100644 index 0000000..ca76fc5 --- /dev/null +++ b/openspec/changes/archive/2026-05-01-trim-redundant-tests/proposal.md @@ -0,0 +1,27 @@ +## Why + +The current contract test suite includes redundant parser smoke tests and duplicated revision/URI parsing assertions spread across CLI and ops files. This increases maintenance cost and makes failures noisy, while weakening the contract-first intent of the test taxonomy. + +## What Changes + +- Remove parser-creation smoke tests that are already covered by more specific parser argument tests in the same file. +- Consolidate revision/ref/URI parsing checks into a single parameterized contract-focused suite instead of repeating equivalent checks in multiple workflow tests. +- Reclassify external-process runtime-orchestration tests from fast contract selection into `slow` coverage where they match integration-style execution behavior. +- Collapse duplicate expensive adapter execution paths into parameterized stage/case matrices so one maintained suite covers each contract family without repeated near-identical runtime cost. +- Keep workflow tests focused on operational invariants (delegation, head movement, state transitions, and boundary errors) rather than parser grammar duplication. +- Preserve or improve traceability by using canonical contract IDs and parameterized case IDs for the consolidated parsing matrix. 
+ +## Capabilities + +### New Capabilities +- `revision-parsing-contract-matrix`: Centralized, parameterized contract matrix for ref parsing, DML URI canonicalization, and revision resolution forms/error boundaries. + +### Modified Capabilities +- `test-contract-matrix`: Tighten migration/removal guidance for superseded redundant tests in contract suites by explicitly pruning duplicate parser checks once parity is confirmed. + +## Impact + +- Affected tests in `tests/contracts/internal/cli/**` and `tests/contracts/internal/ops/**`, primarily parser-smoke and revision-parsing overlap points. +- Affected fast-path runtime orchestration tests currently selected by `pytest -m "not slow"`, especially external-process adapter execution paths. +- New/updated OpenSpec test capability artifacts under `openspec/specs/`. +- No user-facing runtime/API behavior changes; scope is test structure and maintainability. diff --git a/openspec/changes/archive/2026-05-01-trim-redundant-tests/specs/revision-parsing-contract-matrix/spec.md b/openspec/changes/archive/2026-05-01-trim-redundant-tests/specs/revision-parsing-contract-matrix/spec.md new file mode 100644 index 0000000..f09d45f --- /dev/null +++ b/openspec/changes/archive/2026-05-01-trim-redundant-tests/specs/revision-parsing-contract-matrix/spec.md @@ -0,0 +1,34 @@ +## ADDED Requirements + +### Requirement: Revision and URI parsing contracts are centrally owned by one parameterized matrix suite +The repository SHALL define revision/ref/URI parsing behavior in one maintained contract test suite that uses parameterized case matrices rather than duplicating equivalent parsing assertions across workflow tests. 
+ +#### Scenario: Parsing contract matrix is the single maintained owner +- **WHEN** maintained tests assert behavior for `parse_ref`, DML URI canonicalization, or revision-form resolution +- **THEN** those assertions are implemented in the centralized parsing contract matrix suite instead of being repeated across unrelated workflow contract files + +#### Scenario: Workflow contracts avoid duplicate parsing assertions +- **WHEN** a workflow contract test validates delegation, state transitions, or side-effect invariants +- **THEN** it uses canonical valid inputs and does not re-assert grammar-level parsing variants already covered by the parsing matrix + +### Requirement: Parsing matrix cases include canonical contract IDs and explicit case labels +The centralized parsing matrix SHALL encode each case with direct canonical contract IDs and readable case labels in parameterized IDs. + +#### Scenario: Parameterized parsing case includes direct canonical ID +- **WHEN** a parsing behavior case is defined via parameterization +- **THEN** the case `id=` includes a direct literal canonical contract ID and a human-readable case label + +#### Scenario: Parsing case failures remain traceable +- **WHEN** a parsing matrix case fails +- **THEN** the failing node identifier includes both the contract ID and case label needed to identify the exact parsing form boundary + +### Requirement: Revision-form matrix covers accepted and rejected local resolution boundaries +The centralized parsing matrix SHALL cover the accepted revision forms and local-only rejection boundaries required by commit/project revision resolution behavior. 
+ +#### Scenario: Accepted revision forms resolve with expected classification +- **WHEN** the suite evaluates accepted revision forms (branch, tag, ancestry expression, direct commit id, explicit commit ref) +- **THEN** each form resolves to the expected classification and commit target for the fixture setup + +#### Scenario: Unfetched remote revision form fails with local-resolution boundary +- **WHEN** a `dml://...#` revision form is evaluated without corresponding local tracking state +- **THEN** resolution fails with the documented local-resolution boundary error indicating fetch is required diff --git a/openspec/changes/archive/2026-05-01-trim-redundant-tests/specs/test-contract-matrix/spec.md b/openspec/changes/archive/2026-05-01-trim-redundant-tests/specs/test-contract-matrix/spec.md new file mode 100644 index 0000000..97d40ba --- /dev/null +++ b/openspec/changes/archive/2026-05-01-trim-redundant-tests/specs/test-contract-matrix/spec.md @@ -0,0 +1,28 @@ +## MODIFIED Requirements + +### Requirement: Legacy test suite is fully migrated and superseded tests are removed +The repository SHALL complete migration of maintained tests to the contract matrix setup and SHALL remove superseded legacy tests to avoid duplicate maintenance. 
+ +#### Scenario: Superseded legacy tests are removed after parity +- **WHEN** a legacy test's contract coverage is represented by migrated contract-matrix tests +- **THEN** the legacy test is removed from maintained test paths + +#### Scenario: End state contains only maintained tests aligned to taxonomy +- **WHEN** migration is complete +- **THEN** maintained tests conform to taxonomy, canonical ID, lifecycle parameterization, and slow-marker requirements defined in this specification + +#### Scenario: Redundant parser smoke tests are removed once equivalent arg-level coverage exists +- **WHEN** a parser-creation smoke test duplicates parser argument assertions already maintained in the same suite +- **THEN** the redundant parser-creation smoke test is removed after parity verification + +#### Scenario: Duplicate revision parsing checks are removed after central matrix adoption +- **WHEN** revision/ref/URI parsing forms are covered by the centralized parsing contract matrix +- **THEN** duplicate parsing checks in workflow-oriented contract tests are removed and workflow tests remain focused on operational invariants + +#### Scenario: External-process orchestration tests are classified as slow +- **WHEN** a test requires subprocess execution, adapter polling loops, remote roundtrips, or equivalent runtime orchestration +- **THEN** the test is marked `slow` and excluded from `pytest -m "not slow"` selection + +#### Scenario: Expensive adapter-path duplicates are collapsed into parameterized matrices +- **WHEN** multiple maintained tests exercise the same adapter-path contract family with near-identical setup and assertions +- **THEN** they are consolidated into one parameterized matrix suite that preserves canonical contract IDs and behavior-stage traceability diff --git a/openspec/changes/archive/2026-05-01-trim-redundant-tests/tasks.md b/openspec/changes/archive/2026-05-01-trim-redundant-tests/tasks.md new file mode 100644 index 0000000..e738b60 --- /dev/null +++ 
b/openspec/changes/archive/2026-05-01-trim-redundant-tests/tasks.md @@ -0,0 +1,22 @@ +## 1. Build centralized parsing contract matrix + +- [x] 1.1 Create a new contract test suite for revision/ref/URI parsing with parameterized case IDs that include canonical contract IDs. +- [x] 1.2 Port parsing assertions from mixed workflow tests into the new matrix (ref parsing, URI canonicalization, revision-form classification, and local-resolution rejection boundaries). + +## 2. Prune redundant parser-smoke tests + +- [x] 2.1 Remove parser-creation smoke tests that are fully duplicated by subcommand argument parsing tests in the same file. +- [x] 2.2 Remove duplicate revision parsing checks from workflow-oriented contract tests once parity is covered by the matrix. + +## 3. Reclassify external-process tests and collapse expensive adapter duplicates + +- [x] 3.1 Inventory fast-path tests that invoke subprocess adapters, polling loops, or equivalent runtime orchestration and mark qualifying tests as `slow`. +- [x] 3.2 Consolidate duplicate expensive adapter-path tests into parameterized matrices while preserving canonical contract IDs and stage labels. +- [x] 3.3 Verify parity for collapsed adapter-path coverage against the removed/reclassified tests. + +## 4. Preserve and verify invariant-focused coverage + +- [x] 4.1 Keep workflow/delegation/state-transition tests intact and confirm they no longer assert parser grammar variants. +- [x] 4.2 Run targeted contract test files touched by this change and fix any coverage gaps introduced by consolidation. +- [x] 4.3 Run `pytest -m "not slow"` and confirm the fast contract path remains green after redundancy removal. +- [x] 4.4 Compare non-slow runtime before/after and record reduction against target. 
(Internal contract non-slow runtime: 59.89s -> 46.93s, ~21.6% faster, 7 adapter-path tests moved to slow) diff --git a/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/.openspec.yaml b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/.openspec.yaml new file mode 100644 index 0000000..2988acf --- /dev/null +++ b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-02 diff --git a/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/design.md b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/design.md new file mode 100644 index 0000000..857b7c8 --- /dev/null +++ b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/design.md @@ -0,0 +1,66 @@ +## Context + +Head and index pointer storage is currently handled in multiple internal ops modules. `CommitOps`, `IndexOps`, `RemoteOps`, and `BaseOps.get_ctx` all read or write `Head` and `Index` objects directly. That makes pointer lifecycle rules hard to enforce in one place, leaks storage refs outside `HeadOps`, and forces retryable stale-write handling to be reimplemented ad hoc. + +The refactor needs to preserve current transaction behavior. Callers such as `IndexOps.commit`, `put_literal`, and `start_fn` must be able to run larger workflows in a caller-owned transaction while still routing pointer operations through `HeadOps` public methods. + +## Goals / Non-Goals + +**Goals:** +- Make `HeadOps` the only internal module that creates, reads, updates, or deletes branch/index pointers in storage. +- Hide `Head` and `Index` refs and objects from all non-`HeadOps` callers. +- Add compare-and-swap style branch/index commit updates using `update_branch_commit` and `update_index_commit`. +- Raise a dedicated retryable conflict error with a `current_commit` attribute when the expected commit no longer matches storage. 
+- Preserve single-transaction caller workflows by letting public `HeadOps` methods accept `txn=None`. + +**Non-Goals:** +- Changing higher-level retry strategy or forcing callers to read and write in the same transaction. +- Redesigning DAG, commit, or adapter execution semantics. +- Preserving head/index ref exposure in API or CLI surfaces that can be simplified to branch names and opaque index ids. + +## Decisions + +### `HeadOps` owns all pointer persistence +All branch/index storage access will move behind `HeadOps` public methods. Other modules will work with branch names, opaque index ids, and commit refs only. + +Alternative considered: leave reads in shared helpers and centralize only writes. Rejected because reads also leak `Head`/`Index` refs and make it harder to keep caller contracts uniform. + +### Public methods accept optional `txn` +Each public method will accept `txn=None`. If a txn is provided, the method uses it. Otherwise it opens its own transaction and delegates to a private txn-required helper. + +Alternative considered: require callers to use private helpers for shared transactions. Rejected because the public API should remain the only caller entry point. + +### Commit updates use expected-current semantics +`update_branch_commit(name, old_commit, new_commit, txn=None)` and `update_index_commit(index_id, old_commit, new_commit, txn=None)` will only update storage when the current commit matches `old_commit`. + +Alternative considered: blind setter methods. Rejected because callers like `put_literal`, `start_fn`, and `commit` need a precise stale-write signal to retry safely. + +### Conflict reporting uses one dedicated repo error subclass +The stale-write path will raise a dedicated `DmlRepoError` subclass carrying only `current_commit`. + +Alternative considered: separate branch/index conflict types or richer payloads. Rejected as unnecessary because callers already know the target and expected/new commits. 
+ +### Index creation is commit-based +`HeadOps` will expose `create_index(commit_ref, txn=None)` and keep internal ref generation private. Callers must supply the starting commit regardless of whether it originated from a branch or an argv-backed commit bootstrap flow. + +Alternative considered: multiple `create_index_from_*` entry points. Rejected because they duplicate pointer allocation concerns and widen `HeadOps` responsibilities. + +## Risks / Trade-offs + +- Pointer contract churn across many callers -> Update `CommitOps`, `IndexOps`, `RemoteOps`, API, CLI, and tests in one change. +- Hidden ref assumptions in tests -> Convert assertions to branch names, index ids, and commit refs. +- `BaseOps.get_ctx` currently assumes head/index-like refs -> Replace or narrow that helper so it no longer exposes `Head` objects outside `HeadOps`. +- API compatibility drift during ref removal -> Keep external behavior stable while changing only internal handle types where possible. + +## Migration Plan + +- Add the new `HeadOps` public methods and conflict error. +- Move branch/index storage access in `CommitOps`, `IndexOps`, and `RemoteOps` to those methods. +- Remove or refactor shared helpers that expose `Head`/`Index` objects. +- Update API, CLI, and tests to stop carrying head/index refs. +- Run contract and integration coverage for commit/head/index flows. + +## Open Questions + +- Whether index ids should be raw ids or a small opaque token object at API boundaries. +- Whether `HeadOps` should expose `describe_branch` / `describe_index` helpers later, or keep the public surface minimal for this change. 
diff --git a/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/proposal.md b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/proposal.md new file mode 100644 index 0000000..6063126 --- /dev/null +++ b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/proposal.md @@ -0,0 +1,25 @@ +## Why + +Head and index persistence is currently spread across `HeadOps`, `CommitOps`, `IndexOps`, `RemoteOps`, and shared helpers. That leaks `Head` and `Index` storage details across the internal ops layer, makes pointer updates inconsistent, and leaves retryable stale-write handling undefined. + +## What Changes + +- Route all branch and index database reads, writes, creation, and deletion through `HeadOps` public methods. +- Stop exposing `Head` and `Index` objects, or refs to them, outside `HeadOps`. +- Add atomic branch/index commit update methods that require the caller to provide the expected current commit. +- Add a dedicated conflict error for stale branch/index updates with a `current_commit` attribute so callers can retry. +- Update internal callers to use branch names, opaque index ids, and commit refs instead of head/index refs. + +## Capabilities + +### New Capabilities +- `headops-pointer-management`: Internal branch/index pointer lifecycle and atomic commit update behavior owned by `HeadOps`. + +### Modified Capabilities +- `git-like-commit-ops`: Branch-targeted commit workflows must advance branches through `HeadOps` instead of direct head storage access. + +## Impact + +- Affected code: `src/daggerml/_internal/ops/head.py`, `commit.py`, `index.py`, `remote.py`, `base_ops.py`, `__init__.py`, and user-facing API/CLI code that currently carries head/index refs. +- Affected tests: internal ops contract tests, integration tests for head/index flows, and any API/CLI tests that assume head/index refs are exposed. 
+- Affected internal API: branch and index callers will move to branch names, opaque index ids, commit refs, and `HeadOps` conflict-aware update methods. diff --git a/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..6ca437a --- /dev/null +++ b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/specs/git-like-commit-ops/spec.md @@ -0,0 +1,16 @@ +## ADDED Requirements + +### Requirement: Branch-targeted commit workflows update branches through HeadOps +The system SHALL perform branch advancement in git-like commit workflows through `HeadOps` public methods rather than direct head storage access. + +#### Scenario: Merge updates branch through HeadOps +- **WHEN** a branch-targeted merge needs to fast-forward or store a merge commit +- **THEN** the workflow advances the branch through `HeadOps` using the expected current commit and the new commit + +#### Scenario: Revert updates branch through HeadOps +- **WHEN** a branch-targeted revert creates a new commit +- **THEN** the workflow advances the branch through `HeadOps` rather than writing the head object directly + +#### Scenario: DAG checkout updates branch through HeadOps +- **WHEN** DAG checkout creates a new commit on a branch +- **THEN** the workflow advances the branch through `HeadOps` rather than writing the head object directly diff --git a/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/specs/headops-pointer-management/spec.md b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/specs/headops-pointer-management/spec.md new file mode 100644 index 0000000..a0053b0 --- /dev/null +++ b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/specs/headops-pointer-management/spec.md @@ -0,0 +1,52 @@ +## ADDED Requirements + +### 
Requirement: HeadOps owns branch and index pointer persistence +The system SHALL route all branch and index storage creation, lookup, update, listing, and deletion through `HeadOps` public methods. + +#### Scenario: Non-HeadOps caller needs branch commit +- **WHEN** an internal caller needs the commit for a branch +- **THEN** it obtains that commit through a `HeadOps` public method instead of reading a `Head` object or head ref directly + +#### Scenario: Non-HeadOps caller needs index commit +- **WHEN** an internal caller needs the commit for an index +- **THEN** it obtains that commit through a `HeadOps` public method instead of reading an `Index` object or index ref directly + +### Requirement: HeadOps hides head and index refs from callers +The system SHALL keep branch and index refs internal to `HeadOps` and SHALL expose branch names, opaque index ids, and commit refs to non-`HeadOps` callers. + +#### Scenario: Branch-targeted workflow uses branch name +- **WHEN** an internal caller targets a branch +- **THEN** the caller interacts with `HeadOps` using the branch name rather than a head ref + +#### Scenario: Index-targeted workflow uses opaque index id +- **WHEN** an internal caller targets an index +- **THEN** the caller interacts with `HeadOps` using an opaque index id rather than an index ref + +### Requirement: HeadOps supports atomic commit updates for pointers +The system SHALL update branch and index commits through `update_branch_commit` and `update_index_commit` methods that require the caller to provide the expected current commit. 
+ +#### Scenario: Expected commit matches +- **WHEN** a caller requests a branch or index commit update with the correct current commit +- **THEN** `HeadOps` stores the new commit atomically + +#### Scenario: Expected commit is stale +- **WHEN** a caller requests a branch or index commit update with an outdated current commit +- **THEN** `HeadOps` rejects the update and raises a dedicated conflict error + +### Requirement: Conflict error reports current commit for retries +The system SHALL raise a dedicated `DmlRepoError` subclass for stale branch/index updates, and that exception SHALL expose the correct `current_commit`. + +#### Scenario: Caller retries after stale index update +- **WHEN** `update_index_commit` fails because the stored commit changed +- **THEN** the raised conflict error includes the current stored commit for the caller to inspect and retry from + +### Requirement: HeadOps public methods support caller-owned transactions +The system SHALL allow callers to pass an existing transaction into `HeadOps` public methods, and SHALL create a transaction internally when one is not provided. + +#### Scenario: Caller provides transaction +- **WHEN** a caller invokes a `HeadOps` public method with `txn=` +- **THEN** the method performs its work within that transaction + +#### Scenario: Caller omits transaction +- **WHEN** a caller invokes a `HeadOps` public method without `txn=` +- **THEN** the method creates and uses its own transaction diff --git a/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/tasks.md b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/tasks.md new file mode 100644 index 0000000..6b9631f --- /dev/null +++ b/openspec/changes/archive/2026-05-02-route-head-index-db-through-headops/tasks.md @@ -0,0 +1,19 @@ +## 1. HeadOps pointer boundary + +- [x] 1.1 Add the new `HeadOps` public API for branch and index listing, creation, deletion, commit lookup, and commit updates with optional `txn` support. 
+- [x] 1.2 Add private txn-required helpers behind the public `HeadOps` methods. +- [x] 1.3 Add a dedicated stale-update `DmlRepoError` subclass with a `current_commit` attribute. +- [x] 1.4 Move internal head/index ref generation fully inside `HeadOps`. + +## 2. Internal caller migration + +- [x] 2.1 Refactor `CommitOps` branch-targeted workflows to use `HeadOps` branch methods instead of direct head storage access. +- [x] 2.2 Refactor `IndexOps` to use opaque index ids plus `HeadOps` commit lookup and `update_index_commit` flows. +- [x] 2.3 Refactor `RemoteOps` tracking-head writes to use `HeadOps` public methods. +- [x] 2.4 Remove or narrow shared helpers such as `BaseOps.get_ctx` that expose `Head` or `Index` objects outside `HeadOps`. + +## 3. Surface and test updates + +- [x] 3.1 Update API and CLI code to stop carrying head/index refs and use branch names, opaque index ids, and commit refs instead. +- [x] 3.2 Update contract and integration tests for the new `HeadOps` pointer boundary and stale-update conflict behavior. +- [x] 3.3 Run the relevant head, index, commit, remote, API, and CLI test coverage and fix any regressions. 
diff --git a/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/.openspec.yaml b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/.openspec.yaml new file mode 100644 index 0000000..2988acf --- /dev/null +++ b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-02 diff --git a/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/design.md b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/design.md new file mode 100644 index 0000000..0b16e28 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/design.md @@ -0,0 +1,104 @@ +## Context + +`HeadOps` currently persists branch and index pointers as DB objects (`Head`, `Index`) under LMDB namespaces and relies on DB transactions for storage-level atomicity. The target model replaces these pointers with filesystem refs under `.dml/refs` in the project directory. + +The design keeps stale-update correctness via expected-current commit checks and introduces file locks only around mutation sites. The release is intentionally breaking and excludes backward compatibility. + +Remote S3 protocol behavior remains unchanged. This change affects only local pointer/tracking storage representation. + +## Goals / Non-Goals + +**Goals:** +- Persist local heads/tags/indexes as files under `.dml/refs/local/{heads,tags,indexes}`. +- Persist remote-tracking refs as files under `.dml/refs/remote/<owner>/<project>/{heads,tags}`. +- Manage local and remote-tracking branch/tag/index pointers through `HeadOps`. +- Store only commit IDs in pointer files. +- Keep stale-write detection through compare-and-swap style update methods in `HeadOps`. +- Use lock-scoped atomic file replacement for pointer mutations. +- Return commit refs directly from `list_pointer_roots`. +- Keep `pull_uri_into_branch` as `fetch_uri` + merge behavior. 
+- Keep GC traversal rooted in current refs (no root-argument redesign in this change). +- Remove `Head`/`Index` types and DB pointer namespaces. + +**Non-Goals:** +- Backward compatibility with prior DB pointer storage. +- Hybrid read/write behavior across both DB and filesystem pointer backends. +- Broader redesign of commit, DAG, or remote CAS semantics. +- Changes to S3 remote CAS/refs path schema, payload schema, push/fetch protocol, or remote GC behavior. + +## Decisions + +### Pointer storage layout is file-backed +`HeadOps` reads/writes pointer files in the project `.dml/refs` tree instead of storing pointer objects in LMDB. + +### URI strings are I/O shape, not local storage identity +`dml://<owner>/<project>[#branch|@tag]` remains the user-facing parse/render format. Local remote-tracking pointers are persisted as filesystem paths under `.dml/refs/remote/<owner>/<project>/{heads,tags}`. + +### Identifier character set is constrained for unambiguous path mapping +`owner`, `project`, `branch`, and `tag` identifiers are constrained to alphanumeric characters plus `-`, `*`, `|`, and `_` (`[A-Za-z0-9\-\*\|_]+`). This guarantees `<identifier>` remains a single path segment with no escaping or slash handling. + +### Pointer payload format is commit-id only +Each pointer file stores the raw commit ID string (e.g. 64-char lowercase hex). `HeadOps` converts this into `Ref("commit:<commit-id>")` at API boundaries. + +### No `head:` / `index:` pointer-string surface +Callers and user-facing layers do not pass or expose `head:` or `index:` strings. Branch targeting uses plain branch names, index targeting uses opaque index ids, and commit targeting uses `commit:` refs where needed. + +### Lock only mutation sites +File locking is used around create/update/delete mutation paths. Read-only operations remain lock-free unless they are part of a mutation critical section. 
+ +### Atomic updates use expected-current checks +`update_branch_commit` and `update_index_commit` continue to require an `old_commit` and reject stale writes with `DmlPointerConflictError(current_commit=...)`. + +### `list_pointer_roots` returns commit refs +GC/root traversal consumes commit refs directly, not pointer refs. + +### No migration/back-compat +Implementation proceeds as: implement file paths -> switch all pointer operations to file paths -> remove DB pointer paths/types. + +### Remote protocol remains unchanged +All S3 remote object/ref formats and remote operations remain as-is. Only local tracking ref persistence changes. + +## Proposed File Tree + +```text +<project-dir>/ + .dml/ + refs/ + local/ + heads/ + <branch> + tags/ + <tag> + indexes/ + <index-id> + remote/ + <owner>/ + <project>/ + heads/ + <branch> + tags/ + <tag> +``` + +## File Semantics + +- File content: `<commit-id>` (single commit id string, no `commit:` prefix). +- Missing pointer file: treated as missing pointer (`DmlRepoError` at `HeadOps` boundary). +- Commit existence: validated against LMDB commit namespace before create/update acceptance. +- Mutation writes: write temp file + atomic replace in same directory while holding lock. + +## Pull/Fetch/GC Semantics + +- `fetch_uri` materializes commit state and creates/updates matching local remote-tracking refs under `.dml/refs/remote/...`. +- `pull_uri_into_branch` remains `fetch_uri(uri)` followed by merge into the selected local branch. +- GC roots are derived from current refs; root-argument surface changes are outside this proposal. + +## Risks / Trade-offs + +- Pointer updates are no longer inside LMDB transaction boundaries. +- Lock discipline must be correct to avoid torn concurrent mutations. +- Tests that assert `head`/`index` namespace iteration will need replacement with filesystem ref assertions. + +## Open Questions + +- None at proposal time; path layout and payload format are fixed for this change. 
diff --git a/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/proposal.md b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/proposal.md new file mode 100644 index 0000000..e605e78 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/proposal.md @@ -0,0 +1,57 @@ +## Why + +Branch and index pointers are currently persisted in LMDB namespaces (`head`, `index`) as typed objects (`Head`, `Index`). This keeps pointer identity coupled to DB object modeling when the desired model is filesystem refs under project-local `.dml/refs`. Moving pointers to files simplifies pointer inspection and keeps branch/index mutation rules in one file-backed ref layer. + +This change is intentionally breaking and does not provide backward compatibility with prior pointer storage. + +## What Changes + +- Replace local branch/tag/index pointer persistence with filesystem refs under `.dml/refs/local`, with `HeadOps` as the owner for branch/tag/index pointer management. +- Add local remote-tracking pointer persistence under `.dml/refs/remote/<owner>/<project>/`. +- Store pointer file contents as raw commit IDs (no `commit:` prefix). +- Update `HeadOps` pointer read/write/list/update logic to be file-backed with mutation-site file locking and atomic write/replace. +- Change `list_pointer_roots` to return commit refs directly (`Ref("commit:<commit-id>")`) rather than pointer refs. +- Remove all `head:` / `index:` pointer-string usage across the codebase (including `_cli/*`, `ops/commit.py`, `ops/index.py`, and related tests/docs); branch and index are addressed by branch names and opaque index ids only. +- Keep S3 remote CAS/refs protocol and layout unchanged. +- Treat `dml://<owner>/<project>[#branch|@tag]` as I/O boundary format only (user input / CLI output), while local tracking storage uses `.dml/refs/remote/...` paths. 
+- Constrain `owner`, `project`, `branch`, and `tag` identifiers to alphanumeric characters plus `-`, `*`, `|`, and `_` (`[A-Za-z0-9\-\*\|_]+`) so `<identifier>` maps unambiguously to a single path segment. +- Keep internal `pull_uri_into_branch` semantics as `fetch_uri` followed by merge into the target local branch. +- Keep GC root behavior as traversal from current refs; user-provided root selection is not part of this change. +- Remove `Head` and `Index` types from internal type contracts and namespace registration. +- Remove `head` and `index` DB namespace usage entirely after file-backed flow is in place. + +## Proposed File Tree + +```text +<project-dir>/ + .dml/ + refs/ + local/ + heads/ + <branch> # file contents: <64-hex-commit-id> + tags/ + <tag> # file contents: <64-hex-commit-id> + indexes/ + <index-id> # file contents: <64-hex-commit-id> + remote/ + <owner>/ + <project>/ + heads/ + <branch> # file contents: <64-hex-commit-id> + tags/ + <tag> # file contents: <64-hex-commit-id> +``` + +## Capabilities + +### Modified Capabilities +- `headops-pointer-management`: pointer storage backend changes from DB objects to filesystem refs; roots become commit refs. + +### New Capabilities +- `file-backed-pointer-refs`: project-local ref directory structure, file content format, and locking/update semantics for local and remote-tracking refs. + +## Impact + +- Affected code: `src/daggerml/_internal/ops/head.py`, `src/daggerml/_internal/types.py` (primary), plus all call sites/tests that assert DB `head`/`index` namespace behavior. +- Affected behavior: pointer persistence moves outside LMDB transaction atomicity; stale-update safety is preserved via expected-current commit checks plus lock-scoped atomic file replacement at mutation sites. +- Compatibility: no migration or compatibility layer; old DB pointer storage is removed in this release. 
diff --git a/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/specs/file-backed-pointer-refs/spec.md b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/specs/file-backed-pointer-refs/spec.md new file mode 100644 index 0000000..d504554 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/specs/file-backed-pointer-refs/spec.md @@ -0,0 +1,59 @@ +## ADDED Requirements + +### Requirement: Local pointer refs use filesystem paths under `.dml/refs/local` +The system SHALL persist local refs as files under `.dml/refs/local/{heads,tags,indexes}`. + +All `owner`, `project`, `branch`, and `tag` identifier values SHALL match `[A-Za-z0-9\-\*\|_]+`. + +#### Scenario: Local branch head path +- **WHEN** a caller resolves local branch `main` +- **THEN** the pointer path is `<project-dir>/.dml/refs/local/heads/main` + +#### Scenario: Local tag path +- **WHEN** a caller resolves local tag `v1` +- **THEN** the pointer path is `<project-dir>/.dml/refs/local/tags/v1` + +#### Scenario: Local index path +- **WHEN** a caller resolves local index id `abc123` +- **THEN** the pointer path is `<project-dir>/.dml/refs/local/indexes/abc123` + +### Requirement: Remote-tracking refs use filesystem paths under `.dml/refs/remote` +The system SHALL persist remote-tracking refs as files under `.dml/refs/remote/<owner>/<project>/{heads,tags}`. + +`dml://<owner>/<project>[#branch|@tag]` SHALL remain the user-facing parse/render shape for I/O and SHALL NOT require matching on-disk filename literals. 
+ +#### Scenario: Remote-tracking branch path +- **WHEN** a caller resolves remote branch `dml://alice/demo#main` +- **THEN** the pointer path is `<project-dir>/.dml/refs/remote/alice/demo/heads/main` + +#### Scenario: Remote-tracking tag path +- **WHEN** a caller resolves remote tag `dml://alice/demo@v1` +- **THEN** the pointer path is `<project-dir>/.dml/refs/remote/alice/demo/tags/v1` + +### Requirement: Fetch updates local remote-tracking refs only +The system SHALL keep remote S3 protocol behavior unchanged and SHALL materialize fetched tracking state into local `.dml/refs/remote/...` files. + +#### Scenario: Fetch branch URI +- **WHEN** `fetch_uri("dml://alice/demo#main")` succeeds +- **THEN** local tracking file `<project-dir>/.dml/refs/remote/alice/demo/heads/main` is created or updated with the fetched commit id + +### Requirement: Pull into branch remains fetch then merge +The system SHALL implement pull-into-branch as fetch followed by merge. + +#### Scenario: Pull branch URI +- **WHEN** `pull_uri_into_branch(uri, branch, user=...)` is invoked +- **THEN** it fetches `uri` and merges the fetched commit into local branch `branch` + +### Requirement: Pointer payload format is raw commit ID +The system SHALL store only the commit ID string in each pointer file. + +#### Scenario: Read pointer payload +- **WHEN** reading a pointer file for a commit +- **THEN** file content is `<commit-id>` with no `commit:` prefix + +### Requirement: Pointer updates are lock-scoped and atomically replaced +The system SHALL apply lock-scoped mutation at pointer mutation sites and SHALL atomically replace pointer files for create/update operations. 
+ +#### Scenario: Concurrent pointer updates +- **WHEN** two writers race to update the same pointer +- **THEN** only one update succeeds for a given expected-current value and stale writes are rejected diff --git a/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/specs/headops-pointer-management/spec.md b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/specs/headops-pointer-management/spec.md new file mode 100644 index 0000000..4811c23 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/specs/headops-pointer-management/spec.md @@ -0,0 +1,43 @@ +## MODIFIED Requirements + +### Requirement: HeadOps owns branch and index pointer persistence +The system SHALL route all branch, tag, and index pointer creation, lookup, update, listing, and deletion through `HeadOps` public methods using filesystem refs. + +#### Scenario: Non-HeadOps caller needs branch commit +- **WHEN** an internal caller needs the commit for a branch +- **THEN** it obtains that commit through a `HeadOps` public method backed by `.dml/refs` files + +#### Scenario: Non-HeadOps caller needs index commit +- **WHEN** an internal caller needs the commit for an index +- **THEN** it obtains that commit through a `HeadOps` public method backed by `.dml/refs` files + +### Requirement: HeadOps hides head and index refs from callers +The system SHALL keep pointer file-path and pointer-ref details internal to `HeadOps` and SHALL expose branch names, opaque index ids, and commit refs to non-`HeadOps` callers. 
+ +#### Scenario: Callers do not use `head:` or `index:` string forms +- **WHEN** internal or CLI callers target branches or indexes +- **THEN** they use plain branch names and opaque index ids, not `head:` or `index:` strings + +### Requirement: HeadOps supports atomic commit updates for pointers +The system SHALL update branch and index commits through `update_branch_commit` and `update_index_commit` methods that require the caller to provide the expected current commit. + +#### Scenario: Expected commit matches +- **WHEN** a caller requests a branch or index commit update with the correct current commit +- **THEN** `HeadOps` stores the new commit by atomically replacing the pointer file + +#### Scenario: Expected commit is stale +- **WHEN** a caller requests a branch or index commit update with an outdated current commit +- **THEN** `HeadOps` rejects the update and raises a dedicated conflict error + +### Requirement: Conflict error reports current commit for retries +The system SHALL raise a dedicated `DmlRepoError` subclass for stale branch/index updates, and that exception SHALL expose the correct `current_commit`. + +### Requirement: Pointer roots are commit refs +The system SHALL return commit refs directly from `HeadOps.list_pointer_roots`. + +#### Scenario: GC root collection +- **WHEN** callers request pointer roots for reachability traversal +- **THEN** `HeadOps.list_pointer_roots` returns commit refs gathered from all local heads and indexes + +### Requirement: GC root traversal remains current-ref based +The system SHALL keep garbage-collection root discovery based on current refs, and SHALL NOT require this change to introduce user-specified GC root arguments. 
diff --git a/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/tasks.md b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/tasks.md new file mode 100644 index 0000000..a01682e --- /dev/null +++ b/openspec/changes/archive/2026-05-03-file-backed-head-index-refs/tasks.md @@ -0,0 +1,23 @@ +## 1. File-backed pointer infrastructure in HeadOps + +- [x] 1.1 Add `.dml/refs` path resolution helpers in `HeadOps` for local heads/tags/indexes and remote-tracking heads/tags. +- [x] 1.1a Add/centralize identifier validation for `owner`, `project`, `branch`, and `tag` as `[A-Za-z0-9\-\*\|_]+`. +- [x] 1.2 Implement pointer file read/write helpers that map `<commit-id>` <-> `Ref("commit:<commit-id>")`. +- [x] 1.3 Add mutation-site file locking and atomic file replacement for create/update/delete pointer operations. +- [x] 1.4 Keep expected-current commit checks in update paths and raise `DmlPointerConflictError` with `current_commit` on stale writes. + +## 2. Switch runtime pointer flows to filesystem refs + +- [x] 2.1 Update branch/index list/get/create/delete/update methods in `HeadOps` to use file paths only. +- [x] 2.2 Update remote-tracking branch/tag pointer handling in `HeadOps` to use `.dml/refs/remote/<owner>/<project>/{heads,tags}`. +- [x] 2.3 Change `list_pointer_roots` to return commit refs directly. +- [x] 2.4 Ensure `fetch_uri` updates local file-backed remote-tracking refs and `pull_uri_into_branch` remains `fetch + merge`. +- [x] 2.5 Remove `head:` / `index:` string usage from `_cli/*`, `ops/commit.py`, `ops/index.py`, and related call sites; use branch names and opaque index ids instead. + +## 3. Remove DB pointer model + +- [x] 3.1 Remove `Head` and `Index` classes from `src/daggerml/_internal/types.py`. +- [x] 3.2 Remove any remaining `head`/`index` DB namespace assumptions from `HeadOps` implementation. +- [x] 3.3 Update docs/tests/specs that assert DB pointer namespaces to assert filesystem refs and commit-id payload semantics. 
+- [x] 3.4 Keep S3 remote protocol behavior unchanged; verify no remote CAS/refs schema changes are introduced. +- [x] 3.5 Update docs/help/error messages to remove `head:` / `index:` examples and wording. diff --git a/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/.openspec.yaml b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/.openspec.yaml new file mode 100644 index 0000000..905325f --- /dev/null +++ b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-04 diff --git a/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/design.md b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/design.md new file mode 100644 index 0000000..0e8acfa --- /dev/null +++ b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/design.md @@ -0,0 +1,134 @@ +## Context + +The repository now stores branch and index refs as files under `.dml/refs/**`, but important `IndexOps` mutation paths still treat ref updates as if they were LMDB-internal pointer writes. That creates a mismatch between immutable commit creation in LMDB and mutable ref publication on the filesystem: `IndexOps` can write branch or index files while the LMDB transaction that created the target commit is still open. + +The desired ownership boundary is sharper than the current code: + +- `HeadOps` owns all interaction with `.dml/refs/**`. +- `IndexOps` owns commit derivation in LMDB. +- Ref publication happens only after LMDB commit success. + +This is cross-cutting because `HeadOps` API shape, `IndexOps` mutation sequencing, temporary builtin scratch-index flows, and pointer-conflict retry behavior all change together. + +## Goals / Non-Goals + +**Goals:** +- Ensure branch and index files never move to commits that failed to commit in LMDB. +- Keep `.dml/refs/**` access encapsulated inside `HeadOps`. 
+- Define a single optimistic publication pattern for affected `IndexOps` mutation paths: derive commit in LMDB, close transaction, CAS through `HeadOps`, retry on conflict. +- Remove temporary index ref files from builtin and failed-execution helper flows. + +**Non-Goals:** +- Introduce a journal, background replay system, or new ref persistence backend. +- Change public CLI or API semantics beyond conflict timing and retry behavior implied by the new publication order. +- Redesign unrelated branch, tag, or remote-tracking ref behavior. + +## Decisions + +### Decision: `HeadOps` remains the only `.dml/refs/**` boundary +All branch and index lookup, create, compare-and-swap update, listing, and deletion stays behind `HeadOps` public methods. + +Rationale: +- This preserves the existing architectural rule that callers work with branch names, index ids, and commit refs rather than raw ref paths. +- It prevents `IndexOps` or helper code from reintroducing direct filesystem access while fixing the transaction-ordering bug. + +Alternatives considered: +- Let `IndexOps` read/write ref files directly during retry loops. Rejected because it breaks encapsulation and duplicates stale-write handling. + +### Decision: non-bootstrap `HeadOps` pointer methods are file-I/O-only and transaction-free +`get_branch_commit`, `get_index_commit`, `update_branch_commit`, `update_index_commit`, `create_index`, `delete_index`, `list_branches`, `list_indexes`, and `list_pointer_roots` operate only on filesystem refs and pointer-file compare-and-swap state. They do not accept caller-owned transactions and do not open LMDB transactions for commit validation. + +Rationale: +- These methods represent file-backed ref operations, not DB mutations. +- Removing transaction participation prevents callers from assuming ref publication is atomic with LMDB writes. +- Retryable LMDB failures no longer risk leaving pointer-file side effects behind. 
+ +Alternatives considered: +- Keep transaction arguments for commit existence checks. Rejected because the API shape invites ref mutation during active LMDB write transactions. + +### Decision: `create_branch` is the only transaction-aware `HeadOps` method +`create_branch(..., txn=...)` remains the single special case because bootstrap may need to create the initial commit/tree before publishing the branch file. Its transaction-ownership rule matches the current `HeadOps` pattern: `create_branch` closes the transaction if and only if it opened it. Its sequencing requirement is: create bootstrap commit state, finish the transaction that created that commit, then create the branch file. + +Rationale: +- Bootstrap is the only legitimate case where `HeadOps` still needs help creating LMDB state before it can publish a ref. +- Keeping this exception narrow avoids spreading transaction-aware pointer behavior across the rest of the API. + +Alternatives considered: +- Move bootstrap commit creation out of `HeadOps` entirely. Rejected for now to keep the change focused on ref-publication correctness rather than broader bootstrap API restructuring. + +### Decision: affected `IndexOps` mutation paths use optimistic post-transaction publication +For each affected mutation: + +1. Read the current base commit through `HeadOps`. +2. Open an LMDB write transaction. +3. Derive the next immutable commit snapshot from that base commit. +4. Close the LMDB transaction successfully. +5. Ask `HeadOps` to CAS the pointer from the expected old commit to the new commit. +6. If CAS fails with `DmlPointerConflictError`, restart using the conflict's `current_commit` as the new base commit instead of rereading from `.dml/refs/**` separately. + +Rationale: +- This preserves immutable commit construction while treating branch/index files as optimistic publication selectors. +- `DmlPointerConflictError.current_commit` already provides the minimal state needed for retry. 
+ +Alternatives considered: +- Write ref changes first, then LMDB commit. Rejected because it recreates the existing corruption window. +- Journal ref intents inside LMDB. Rejected as unnecessary complexity for this change. + +Illustrative shape: + +```python +base_commit = head_ops.get_index_commit(index_id) +while True: + with self._tx(readonly=False) as txn: + ctx = txn.get_commit_ctx(base_commit) + new_commit = derive_next_commit(ctx, txn) + try: + head_ops.update_index_commit(index_id, base_commit, new_commit) + return new_commit + except DmlPointerConflictError as err: + base_commit = err.current_commit +``` + +For branch-backed finalization, the same pattern applies after commit derivation, except publication targets `update_branch_commit(...)` and index cleanup happens as a separate `HeadOps.delete_index(index_id)` step after successful publication. + +### Decision: index deletion and listing stay simple `HeadOps` operations +`delete_index` remains an unconditional `HeadOps` file-deletion operation, and index listing remains owned entirely by `HeadOps`. + +Rationale: +- Once index publication is no longer performed inside `IndexOps`, deletion is no longer part of an optimistic compare-and-swap contract. +- The change is about moving `.dml/refs/local/indexes/**` ownership entirely behind `HeadOps`, not adding extra concurrency semantics to listing or deletion. + +Alternatives considered: +- Add compare-and-delete semantics to `delete_index`. Rejected because the desired ownership model treats index deletion as plain `HeadOps` cleanup rather than an `IndexOps` publication step. + +### Decision: detached scratch commit helpers do not publish temporary index refs +Builtin execution and failed-execution helper flows build detached scratch commit state directly in LMDB and return the resulting DAG/commit refs without creating temporary index files. 
+ +Rationale: +- Temporary index refs exist only to reuse index-mutation helpers, but they inherit the same unsafe publication assumptions. +- Detached scratch commit construction better matches the actual need in those flows. + +Alternatives considered: +- Keep temporary index refs and move only their publication outside transactions. Rejected because it preserves needless mutable ref churn and extra retry complexity. + +## Risks / Trade-offs + +- [Conflict retries can leave unreachable commits from failed publication attempts] -> Accept as a consequence of immutable optimistic publication; later GC can reclaim them. +- [Retry loops may rebuild commits multiple times under contention] -> Keep mutation logic deterministic from `(base_commit, operation args)` and reuse `current_commit` from conflicts to avoid extra ref reads. +- [Temporary-index helper removal may require new internal commit-building helpers] -> Limit new helpers to detached scratch construction and keep them private to `IndexOps`. +- [Bootstrap sequencing in `create_branch` may still be implemented inconsistently] -> Specify that `HeadOps` follows its existing ownership rule: it closes the transaction only when it opened it. + +## Migration Plan + +1. Narrow `HeadOps` pointer APIs so only `create_branch` remains transaction-aware, while index lookup/list/delete stay fully owned by `HeadOps`. +2. Convert affected `IndexOps` methods to the optimistic derive/commit/CAS/retry loop. +3. Replace temporary index-file helper flows with detached scratch commit construction. +4. Update tests to assert post-transaction publication order and stale-pointer retry behavior. +5. Run targeted internal ops and API tests covering index mutation, commit finalization, and builtin execution helpers. + +Rollback strategy: +- Revert the `IndexOps` publication-loop changes and restore prior `HeadOps` method signatures together, because mixed models would be inconsistent. 
+ +## Open Questions + +- Which existing helper methods in `IndexOps` should be generalized for detached scratch commit construction versus left as mutation-specific code paths? diff --git a/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/proposal.md b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/proposal.md new file mode 100644 index 0000000..9b97572 --- /dev/null +++ b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/proposal.md @@ -0,0 +1,40 @@ +## Why + +File-backed branch and index refs are no longer part of LMDB transaction atomicity, but `IndexOps` still advances them as if they were transactional state. That leaves live paths where `.dml/refs/**` can move to commits that never committed, and it keeps `HeadOps` coupled to caller-owned transactions in places that should now be file-I/O-only. + +## What Changes + +- Change `HeadOps` pointer-management requirements so branch and index pointer operations are file-backed CAS/delete operations that do not depend on caller-owned transactions. +- Preserve `create_branch(..., txn=...)` as the only transaction-aware `HeadOps` API, with the requirement that it create bootstrap commit state before writing the branch file. +- Add an `IndexOps` optimistic publication workflow that reads the current index commit through `HeadOps`, builds a new immutable commit in LMDB, closes the transaction, and then publishes it through `HeadOps` compare-and-swap. +- Require affected index mutation paths to retry from the current stored commit after `HeadOps` reports a stale pointer conflict. +- Remove the need for temporary index ref files in builtin and failed-execution helper flows by treating those paths as detached scratch commit construction rather than published mutable indexes. 
+ +Example target shape: + +```python +base_commit = HeadOps(_db=self._db).get_index_commit(index_id) +while True: + with self._tx(readonly=False) as txn: + ctx = txn.get_commit_ctx(base_commit) + new_commit = derive_next_commit(ctx, txn) + try: + HeadOps(_db=self._db).update_index_commit(index_id, base_commit, new_commit) + break + except DmlPointerConflictError as err: + base_commit = err.current_commit +``` + +## Capabilities + +### New Capabilities +- `indexops-optimistic-ref-publication`: Defines how `IndexOps` derives commits from a `HeadOps`-provided base commit, publishes them through post-transaction CAS, and retries on conflicts. + +### Modified Capabilities +- `headops-pointer-management`: Narrow `HeadOps` transaction support so pointer lookup/update/delete stays inside `HeadOps` file I/O, while `create_branch` remains the only transaction-aware bootstrap entrypoint. + +## Impact + +- Affected code: `src/daggerml/_internal/ops/head.py`, `src/daggerml/_internal/ops/index.py`, and any callers that currently pass `txn=` into non-bootstrap `HeadOps` methods. +- Affected behavior: index mutation and commit-finalization flows, index listing and deletion ownership, builtin scratch DAG creation, and pointer-conflict retry semantics. +- Affected contracts: internal `HeadOps` transaction boundaries, index publication sequencing, and stale-pointer retry behavior. 
diff --git a/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/specs/headops-pointer-management/spec.md b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/specs/headops-pointer-management/spec.md new file mode 100644 index 0000000..4c77eb4 --- /dev/null +++ b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/specs/headops-pointer-management/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: HeadOps public methods support caller-owned transactions +The system SHALL keep transaction-aware behavior limited to `create_branch`, and all other public `HeadOps` pointer-management methods SHALL operate without caller-owned transactions. + +#### Scenario: Caller provides transaction to create_branch +- **WHEN** a caller invokes `create_branch(..., txn=...)` +- **THEN** `HeadOps` uses that transaction only for bootstrap commit creation, closes it only if `HeadOps` opened it, and does not create the branch file until the transaction that created the commit has been closed successfully + +#### Scenario: Caller invokes non-bootstrap pointer method +- **WHEN** a caller invokes any `HeadOps` pointer lookup, listing, update, create-index, or delete-index method other than `create_branch` +- **THEN** the method performs only `.dml/refs/**` file I/O and stale-write checks without accepting or requiring a transaction or validating commit existence in LMDB + +#### Scenario: Index deletion remains plain HeadOps cleanup +- **WHEN** a caller asks `HeadOps` to delete an index ref +- **THEN** `HeadOps` removes the index file as a plain file operation and does not require compare-and-delete semantics diff --git a/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/specs/indexops-optimistic-ref-publication/spec.md b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/specs/indexops-optimistic-ref-publication/spec.md new file mode 100644 index 0000000..ef3e3cd --- /dev/null +++ 
b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/specs/indexops-optimistic-ref-publication/spec.md @@ -0,0 +1,29 @@ +## ADDED Requirements + +### Requirement: IndexOps publishes index mutations through post-transaction compare-and-swap +The system SHALL have affected `IndexOps` mutation paths derive new commits in LMDB before publishing them through `HeadOps` compare-and-swap operations on file-backed index or branch refs. + +#### Scenario: Index mutation publishes after LMDB commit +- **WHEN** an `IndexOps` mutation updates an existing index state +- **THEN** it reads the base commit through `HeadOps`, writes the new immutable commit in an LMDB write transaction, closes that transaction, and only then asks `HeadOps` to compare-and-swap the index ref to the new commit + +### Requirement: IndexOps retries from the current stored commit after stale ref conflicts +The system SHALL retry affected `IndexOps` mutation paths when `HeadOps` reports a stale pointer conflict, using the conflict's current stored commit as the next base commit. + +#### Scenario: Index compare-and-swap loses a race +- **WHEN** `HeadOps.update_index_commit` rejects an `IndexOps` publication attempt with `DmlPointerConflictError(current_commit=commit:new)` +- **THEN** `IndexOps` starts a fresh LMDB write transaction using `commit:new` as the base commit and rebuilds the mutation before retrying publication + +### Requirement: Branch-targeted index commits publish branch movement after commit creation +The system SHALL publish branch advancement for `IndexOps.commit(..., head=...)` only after the new commit has been durably created in LMDB. 
+ +#### Scenario: Branch-backed commit finalization +- **WHEN** `IndexOps.commit` finalizes a working index onto a branch +- **THEN** it writes the new commit in LMDB, closes the LMDB transaction, and only then asks `HeadOps` to advance the branch from the expected old commit to the new commit + +### Requirement: Detached scratch commit helpers do not create temporary index refs +The system SHALL build builtin and failed-execution scratch commit state without publishing temporary index refs under `.dml/refs/local/indexes`. + +#### Scenario: Builtin helper constructs scratch commit +- **WHEN** builtin execution needs a temporary DAG commit to materialize a result +- **THEN** the helper builds detached scratch commit state directly in LMDB and returns the resulting DAG/commit refs without creating or deleting a temporary index ref file diff --git a/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/tasks.md b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/tasks.md new file mode 100644 index 0000000..0cf34df --- /dev/null +++ b/openspec/changes/archive/2026-05-04-ref-cas-indexops-transition/tasks.md @@ -0,0 +1,18 @@ +## 1. HeadOps boundary changes + +- [x] 1.1 Narrow `HeadOps` public pointer methods so only `create_branch` remains transaction-aware. +- [x] 1.2 Update `HeadOps.create_branch` sequencing so bootstrap commit creation finishes before the branch file is written. +- [x] 1.3 Keep index lookup/list/delete fully owned by `HeadOps`, with delete remaining unconditional file cleanup. +- [x] 1.4 Adjust or add tests covering txn-free pointer lookup/update/delete behavior and bootstrap branch creation ordering. + +## 2. IndexOps optimistic publication + +- [x] 2.1 Refactor affected `IndexOps` mutation paths to derive new commits in LMDB and publish them through post-transaction `HeadOps` CAS. +- [x] 2.2 Implement stale-pointer retry loops that restart from `DmlPointerConflictError.current_commit` instead of direct ref-file access. 
+- [x] 2.3 Update `IndexOps.commit(..., head=...)` so branch advancement occurs only after LMDB commit success and index cleanup happens through `HeadOps.delete_index(...)` after publication. + +## 3. Scratch commit helpers and verification + +- [x] 3.1 Replace temporary index-ref helper flows in builtin and failed-execution paths with detached scratch commit construction. +- [x] 3.2 Update tests for builtin execution and failed-execution DAG construction so they assert no temporary index refs are published. +- [x] 3.3 Run targeted internal ops and API tests for index mutation, commit finalization, stale-pointer retries, and scratch helper behavior. diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/.openspec.yaml b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/.openspec.yaml new file mode 100644 index 0000000..eebe4d8 --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-05 diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/design.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/design.md new file mode 100644 index 0000000..2ce2992 --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/design.md @@ -0,0 +1,110 @@ +## Context + +The current repository model splits branch state across mutable branch refs, branch-qualified `remote.project`, and runtime/config overrides. That leaves `HEAD` as a caller-provided concept instead of a persisted repository object, makes detached behavior only partially modeled, and lets mutable project workflows derive their target branch from configuration rather than checkout state. + +This change replaces that model with a Git-like repository truth boundary: + +- `.dml/config.toml` stores branchless project identity and runtime defaults. +- `.dml/HEAD` stores the current checkout state. 
+- `.dml/refs/local/heads/` stores mutable local branch tips. +- Detached commits and tags remain immutable commit selectors. + +This is an intentional breaking change. The design does not preserve compatibility with branch-qualified local `remote.project`, `DML_BRANCH`, or mixed old/new checkout semantics. + +## Goals / Non-Goals + +**Goals:** + +- Make `.dml/HEAD` the sole persisted source of local checkout state. +- Separate project identity from branch selection by making local `remote.project` branchless. +- Make attached and detached checkout semantics explicit and testable. +- Require mutable project workflows to operate only from an attached local branch. +- Keep the Python API override surface available while making default repository behavior depend on `.dml/HEAD`. +- Remove all backward-compatibility paths for the old branch/config/env model. + +**Non-Goals:** + +- Preserving compatibility with repositories that still use branch-qualified local config. +- Supporting detached-HEAD branch advancement semantics. +- Allowing `push` or `pull` to mutate history from detached HEAD. +- Introducing a migration shim, auto-upgrade path, or dual-read support for old and new config formats. + +## Decisions + +### `.dml/HEAD` is a plain-text repository object owned by `HeadOps` + +`HeadOps` already owns file-backed pointer persistence, so it will also own the checkout-state file. The persisted `HEAD` payload has exactly two valid forms: + +- `ref: refs/local/heads/<branch>` +- `commit:<id>` + +This keeps checkout state explicit, human-readable, and aligned with the branch/index pointer boundary already managed by `HeadOps`. + +Alternative considered: + +- Store a structured TOML/JSON HEAD object. Rejected because the state machine only needs two payload forms and plain text keeps parsing and manual inspection simpler. + +### Local project config becomes branchless identity only + +`[project].uri` in local config becomes `dml://<owner>/<project>` with no branch or tag selector. 
Branch selection is no longer a configuration concern. `default_branch` remains a bootstrap and fallback selector for commands that need a branch when creating or fetching initial state, but it no longer represents the active checkout. + +Alternative considered: + +- Keep branch-qualified `remote.project` and add `.dml/HEAD` on top. Rejected because it preserves two competing sources of truth for the active branch and leaves detached semantics ambiguous. + +### Detached commits are immutable sources and do not advance `HEAD` + +When `.dml/HEAD` contains `commit:<id>`, commands that create detached commits may materialize child commits, but `HEAD` remains unchanged and no branch ref moves. This matches the existing low-level `IndexOps.commit(head=None)` behavior and makes detached state a read/derive surface rather than a mutable line of development. + +Alternative considered: + +- Advance `.dml/HEAD` to the newly created detached commit. Rejected because it would make detached state partially mutable, blur the distinction between branches and immutable selectors, and complicate default push/pull semantics. + +### `HEAD` resolution moves from injected branch context to repository state + +Revision resolution will treat `HEAD` and `HEAD~n` as repository-state expressions backed by `.dml/HEAD`. Resolver entry points that currently depend on a caller-supplied current branch will instead resolve `HEAD` from the repository and then walk ancestry from the resolved commit. + +Alternative considered: + +- Keep `HEAD` as syntactic sugar for a caller-provided branch name. Rejected because it leaves `HEAD` disconnected from checkout state and breaks detached resolution. + +### Mutable project workflows require attached HEAD + +Project workflows that mutate a branch, especially `push` and `pull`, require `.dml/HEAD` to be attached to a local branch unless the command explicitly targets a mutable branch parameter. 
For default push behavior, attached branch `foo` maps to remote branch URI `dml://<owner>/<project>#foo`. + +Alternative considered: + +- Allow mutable workflows from detached HEAD by implicitly using the last branch or default branch. Rejected because it reintroduces hidden branch-selection rules and makes history publication surprising. + +### The change is breaking with no compatibility fallback + +The implementation will reject old assumptions directly rather than silently translating them: + +- no `DML_BRANCH` +- no `[branch].current` +- no local branch-qualified `remote.project` +- no dual-resolution path that checks old config before `.dml/HEAD` + +Alternative considered: + +- Add compatibility reads and rewrite-on-save behavior. Rejected because it permanently complicates config resolution, checkout semantics, and failure modes for a model the repository is intentionally replacing. + +## Risks / Trade-offs + +- Old repositories become invalid under the new rules -> Mitigation: state the break explicitly in proposal/spec/tasks and update tests/docs to fail closed rather than silently translating state. +- Detached commits can become dangling and easy to lose -> Mitigation: make detached semantics explicit in status/checkout responses and contract tests so the behavior is intentional and visible. +- Multiple command surfaces currently assume branch defaults from config -> Mitigation: centralize `HEAD` read/resolve behavior in `HeadOps` and resolver code so CLI/API/project workflows do not each invent fallback rules. +- Keeping `Dml(branch=...)` means API callers can still bypass checkout state deliberately -> Mitigation: document that this is an explicit API override, not repository truth, and keep default runtime behavior aligned with `.dml/HEAD`. + +## Migration Plan + +There is no backward-compatible migration plan. + +- Repositories using the old branch-qualified local config model are out of contract once this change lands. 
+- `DML_BRANCH` is removed rather than deprecated. +- Any repository or automation that still depends on old config/env semantics must be rewritten manually outside product compatibility guarantees. +- Rollback, if needed during development, is code rollback before release rather than runtime dual-format support. + +## Open Questions + +- None for the core model. The repository truth boundary, detached semantics, `push` default, and no-compatibility stance are all decided. diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/proposal.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/proposal.md new file mode 100644 index 0000000..e88aa1d --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/proposal.md @@ -0,0 +1,35 @@ +## Why + +The current repository model conflates project identity, branch selection, and checkout state by deriving the active branch from config and environment inputs. That makes attached versus detached behavior implicit, keeps `HEAD` from being a real repository object, and leaves git-like project workflows harder to reason about than they need to be. + +## What Changes + +- Add a real `.dml/HEAD` file as the sole persisted source of local checkout state. +- Support two `HEAD` payload forms: `ref: refs/local/heads/<branch>` for attached mode and `commit:<id>` for detached mode. +- Change local project config so `[project].uri` is branchless project identity only: `dml://<owner>/<project>`. +- **BREAKING** Remove `DML_BRANCH` support entirely from runtime configuration, CLI behavior, hooks, and project workflows. +- **BREAKING** Remove config-derived current-branch behavior; commands that default to the current checkout MUST resolve it from `.dml/HEAD` instead of config. +- **BREAKING** Treat detached checkout state and tags as immutable sources: creating a child commit from a detached checkout does not move `HEAD` or any branch ref. 
+- Require mutable project workflows such as `push` and `pull` to operate only when `HEAD` is attached to a branch, with `push` defaulting to the attached branch's matching remote branch. +- Update revision resolution so `HEAD` and `HEAD~n` resolve through `.dml/HEAD` instead of an injected current-branch string. +- Explicitly reject backward-compatibility behavior: old branch-qualified local project config, `DML_BRANCH`, and mixed old/new checkout semantics are not supported by the new model. + +## Capabilities + +### New Capabilities + +None. + +### Modified Capabilities + +- `headops-pointer-management`: extend `HeadOps` to own persisted `.dml/HEAD` checkout state in addition to branch and index pointers. +- `shared-internal-configuration`: redefine project-local config so `remote.project` is branchless identity and branch selection is no longer a configuration concern. +- `git-like-commit-ops`: resolve checkout state from `.dml/HEAD`, formalize immutable detached commits, and require attached `HEAD` for mutable project workflows. +- `remote-project-refs`: remove `DML_BRANCH` and `[branch].current` assumptions from project-local config, init, hook environment, and project workflow defaults. +- `revision-parsing-contract-matrix`: update revision-resolution ownership so `HEAD` cases are defined by file-backed checkout state rather than config-derived branch context. + +## Impact + +- Affected code includes `_internal.config`, `HeadOps`, `CommitOps`, `IndexOps`, `DmlOps` project workflows, CLI project/status surfaces, and Python API default branch behavior. +- Existing repos using branch-qualified `.dml/config.toml` project URIs or relying on `DML_BRANCH` are incompatible with the new design unless manually rewritten outside any compatibility guarantee. +- Tests and docs covering checkout behavior, config precedence, revision parsing, init, hooks, and push/pull defaults must be updated to reflect the new repository model. 
diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/review.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/review.md new file mode 100644 index 0000000..3e41b98 --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/review.md @@ -0,0 +1,53 @@ +## Implementation Review + +### Gaps and Deviations + +**GAP 1 — Dead `current_branch` parameter on `CommitOps.resolve_revision`** + +`resolve_revision(value, *, current_branch: str | None = None, project_dir: str = ".")` accepts `current_branch` but never uses it. HEAD always resolves through `get_head_state()`. This is the spec's explicitly-rejected "injected branch context" pattern still present as dead API surface. Any future caller could assume it works. Should be removed. + +**GAP 2 — Branch identifier regex too narrow** + +`HeadOps._IDENTIFIER_RE = re.compile(r"^[A-Za-z0-9\-\*\|_]+$")` rejects `/`, `.`, and other characters that are valid in ref names elsewhere in the codebase. Branch names like `feature/my-thing` or `v1.0.0` would be accepted by `create_branch` but then rejected when trying to write them to `HEAD` via `write_attached_head`. Silent, inconsistent narrowing not specified in the proposal. + +**GAP 3 — CLI help text contradicts the spec** + +`setup_init_parser` still shows `dml init --remote-project dml://alice/my-project#main` as an example. This is exactly the form the spec forbids. Validation rejects it at runtime, but the help text is misleading. + +**GAP 4 — `Dml.temporary()` overuses the branch override** + +`Dml.temporary()` passes `branch=branch` to `Dml(...)`, causing `_runtime_branch()` to always return the literal string rather than consulting `.dml/HEAD`. If a subsequent `checkout_project("HEAD~1")` moves HEAD to detached state, `self.branch` becomes stale. The design specifies `Dml(branch=...)` as a deliberate override, not a convenience shortcut — `temporary()` uses it as the latter with no documentation to clarify. 
+ +**GAP 5 — `_default_project_branch` has a config-derived branch fallback** + +`_default_project_branch(branch)` resolves as `branch or attached_head or config.default_branch`. The third arm reaches `DmlConfig.default_branch`. The design does carve out `default_branch` for bootstrap/fetch scenarios, but this method is used in `_project_remote_root` for both `fetch_project` and `pull_project` remote URI construction with no documentation distinguishing the carve-out case from a mutable workflow context. + +**MINOR — `_project_home()` has a silent temp-dir fallback** + +If `_db.path` is not under a `.dml/db` layout, `_project_home()` silently falls back to a temp directory. The spec's "fails closed" stance calls for a hard error — a silent fallback means two `HeadOps` instances for the same repo could write HEAD to different paths. + +--- + +### Summary + +| Issue | Severity | +|---|---| +| Dead `current_branch` parameter on `resolve_revision` | Should be removed | +| Branch identifier regex too narrow — rejects `/`, `.` | Bug — inconsistent with rest of codebase | +| CLI help text shows branch-qualified URI example | Contradicts spec | +| `Dml.temporary()` sets `self.branch`, can go stale after checkout | Design deviation | +| `_default_project_branch` config fallback undocumented | Needs clarification | +| `_project_home()` temp-dir fallback | Contradicts "fails closed" stance | + +--- + +### Follow-up Status + +| Issue | Status | Resolution | +|---|---|---| +| Dead `current_branch` parameter on `resolve_revision` | Fixed | Removed `current_branch` from `CommitOps.resolve_revision` and `resolve_revision_ref`, and updated callers/tests. | +| Branch identifier regex too narrow — rejects `/`, `.` | Fixed | HEAD-attached branch validation now uses repo ref-name rules, and nested branch names are covered in tests. | +| CLI help text shows branch-qualified URI example | Fixed | `dml init` examples/help now use branchless `dml://owner/project` URIs. 
| +| `Dml.temporary()` sets `self.branch`, can go stale after checkout | Fixed | `Dml.temporary()` now returns a HEAD-driven runtime by default instead of pinning a branch override. | +| `_default_project_branch` config fallback undocumented | Not changed | Behavior remains intentional for fetch/bootstrap-style remote URI defaults; mutable workflows still require attached HEAD or explicit branch. | +| `_project_home()` temp-dir fallback | Fixed | `HeadOps._project_home()` now fails closed when the DB path is not a real `.dml/db` layout, and affected fixtures/scripts were updated. | diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..9d91028 --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/git-like-commit-ops/spec.md @@ -0,0 +1,72 @@ +## MODIFIED Requirements + +### Requirement: Revision resolution +The system SHALL resolve revision values used by git-like commands to concrete local commit refs without performing network fetches. `HEAD` and ancestry expressions based on `HEAD` SHALL resolve through the repository's `.dml/HEAD` file. 
+ +#### Scenario: Resolve branch shorthand +- **WHEN** a command receives `main` as a revision +- **THEN** the system resolves it as local branch `main` + +#### Scenario: Resolve remote-tracking branch shorthand +- **WHEN** a command receives `origin/main` as a revision +- **THEN** the system resolves it through the configured remote URI to local tracking ref `dml://<owner>/<project>#main` + +#### Scenario: Resolve fetched DML branch URI +- **WHEN** a command receives `dml://alice/tools#main` as a revision and that tracking ref exists locally +- **THEN** the system resolves it to the commit stored for that tracking ref + +#### Scenario: Resolve fetched DML tag URI +- **WHEN** a command receives `dml://alice/tools@v1.0` as a revision and that tracking ref exists locally +- **THEN** the system resolves it to the commit stored for that tracking ref + +#### Scenario: Unfetched DML URI is not fetched implicitly +- **WHEN** a command receives `dml://alice/tools#main` as a revision and no matching local tracking ref exists +- **THEN** the command fails without contacting the remote + +#### Scenario: Resolve first-parent ancestry from HEAD file +- **WHEN** a command receives `HEAD~2` as a revision +- **THEN** the system resolves `HEAD` through `.dml/HEAD` and walks two first-parent steps from that resolved commit + +#### Scenario: Resolve local tag shorthand +- **WHEN** a command receives `v1.0` as a revision and `v1.0` resolves as a local tag +- **THEN** the system resolves it to the commit referenced by that tag + +### Requirement: Checkout repository state from revision +The system SHALL support checking out repository state from a resolved revision and SHALL distinguish branch-attached from detached checkouts by rewriting `.dml/HEAD`. 
+ +#### Scenario: Checkout branch attaches runtime +- **WHEN** `dml checkout main` resolves `main` to a local branch +- **THEN** the system writes `.dml/HEAD` as `ref: refs/local/heads/main` and reports branch-attached checkout + +#### Scenario: Checkout tag detaches runtime +- **WHEN** `dml checkout v1.0` resolves `v1.0` to a tag target commit +- **THEN** the system writes `.dml/HEAD` as that detached commit and reports detached checkout at that commit + +#### Scenario: Checkout commit expression detaches runtime +- **WHEN** `dml checkout HEAD~1` resolves to a concrete commit +- **THEN** the system writes `.dml/HEAD` as that detached commit and reports detached checkout at that commit + +#### Scenario: Commit while detached does not advance branch or HEAD +- **WHEN** a user checks out a non-branch revision and then runs commit flow through `IndexOps.commit` +- **THEN** the system may create the new detached commit but does not advance any branch head and does not rewrite `.dml/HEAD` + +#### Scenario: Checkout unresolved remote URI fails locally +- **WHEN** `dml checkout dml://alice/tools#main` is requested and no local tracking ref exists for that URI +- **THEN** checkout fails without implicit fetch and reports that the revision cannot be resolved locally + +## ADDED Requirements + +### Requirement: Mutable project workflows require an attached branch +The system SHALL require `.dml/HEAD` to be attached to a local branch before default project workflows mutate branch history or publish a branch tip. 
+ +#### Scenario: Push uses attached HEAD branch by default +- **WHEN** `.dml/HEAD` is attached to local branch `foo` and the user runs project push without an explicit branch override +- **THEN** the system pushes local branch `foo` to remote branch URI `dml://<owner>/<project>#foo` + +#### Scenario: Pull requires attached HEAD +- **WHEN** `.dml/HEAD` is detached and the user runs project pull without an explicit mutable branch target +- **THEN** the command fails instead of selecting a branch from config or environment + +#### Scenario: Merge requires attached HEAD when defaulting destination +- **WHEN** `.dml/HEAD` is detached and the user runs a merge workflow that would otherwise target the current branch +- **THEN** the command fails because the current checkout is not a mutable branch target diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/headops-pointer-management/spec.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/headops-pointer-management/spec.md new file mode 100644 index 0000000..22dc123 --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/headops-pointer-management/spec.md @@ -0,0 +1,38 @@ +## ADDED Requirements + +### Requirement: HeadOps owns persisted checkout state +The system SHALL route `.dml/HEAD` creation, parsing, update, and commit resolution through `HeadOps` public methods rather than allowing callers to read or write the checkout-state file directly. 
+ +#### Scenario: Non-HeadOps caller needs current checkout state +- **WHEN** an internal caller needs to know whether the repository is attached or detached +- **THEN** it obtains that state through a `HeadOps` public method instead of reading `.dml/HEAD` directly + +#### Scenario: Repository bootstrap creates attached HEAD +- **WHEN** repository initialization creates the initial local branch +- **THEN** `HeadOps` persists `.dml/HEAD` as `ref: refs/local/heads/<branch>` for that branch + +### Requirement: HeadOps persists HEAD using two plain-text forms only +The system SHALL persist `.dml/HEAD` using exactly one of two plain-text payload forms: `ref: refs/local/heads/<branch>` for attached mode or `commit:<id>` for detached mode. + +#### Scenario: Attached HEAD is written +- **WHEN** a checkout operation attaches to local branch `feature` +- **THEN** `.dml/HEAD` contains exactly `ref: refs/local/heads/feature` + +#### Scenario: Detached HEAD is written +- **WHEN** a checkout operation detaches at commit `commit:abc123` +- **THEN** `.dml/HEAD` contains exactly `commit:abc123` + +#### Scenario: Invalid HEAD payload fails closed +- **WHEN** `.dml/HEAD` contains any other payload form +- **THEN** `HeadOps` rejects the repository state and does not guess an alternate checkout target + +### Requirement: HeadOps resolves HEAD to the active commit +The system SHALL resolve `.dml/HEAD` to a concrete commit ref by following the attached local branch ref or by returning the detached commit directly. 
+ +#### Scenario: Attached HEAD resolves through local branch ref +- **WHEN** `.dml/HEAD` contains `ref: refs/local/heads/main` +- **THEN** `HeadOps` resolves HEAD to the commit stored at `.dml/refs/local/heads/main` + +#### Scenario: Detached HEAD resolves directly +- **WHEN** `.dml/HEAD` contains `commit:abc123` +- **THEN** `HeadOps` resolves HEAD to `commit:abc123` without consulting any branch ref diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/remote-project-refs/spec.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/remote-project-refs/spec.md new file mode 100644 index 0000000..72aa0d1 --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/remote-project-refs/spec.md @@ -0,0 +1,116 @@ +## MODIFIED Requirements + +### Requirement: Local remote config +The system SHALL store project-local config under `.dml/config.toml` containing branchless project identity and remote storage settings. The current checkout branch MUST NOT be stored in local config. 
+ +#### Scenario: Resolve origin main +- **WHEN** local config defines project identity `dml://alice/demo` and the attached local branch is `main` +- **THEN** `dml push` resolves the default remote target as project owner `alice`, project `demo`, and branch `main` + +#### Scenario: Project fields are stored +- **WHEN** local project config is written for project `alice/demo` +- **THEN** `.dml/config.toml` contains `[project].uri = "dml://alice/demo"` and does not contain branch-selection fields + +#### Scenario: Remote fields are stored +- **WHEN** local project config records the remote storage URI for project `alice/demo` +- **THEN** `.dml/config.toml` contains the configured `[remote]` fields and no local checkout branch field + +#### Scenario: Reject branch-qualified local project URI +- **WHEN** local config would store `dml://alice/demo#main` or `dml://alice/demo@v1` +- **THEN** config validation fails without writing the selector-bearing URI + +### Requirement: Config waterfall precedence +The system SHALL resolve configurable values using explicit CLI/API arguments first, environment variables second, and config file values last. Checkout-state selection is not part of this waterfall and SHALL be resolved from `.dml/HEAD`. 
+ +#### Scenario: Explicit value wins over environment +- **WHEN** a command receives an explicit mutable branch argument and environment variables also provide configuration inputs +- **THEN** the command uses the explicit branch argument for that mutable branch target + +#### Scenario: Environment does not override checkout state +- **WHEN** a command omits an explicit branch argument and environment variables are resolved +- **THEN** the command still derives the current checkout from `.dml/HEAD` rather than from configuration environment variables + +#### Scenario: Config used as fallback for non-checkout values +- **WHEN** a command omits explicit overrides and no matching environment value is set +- **THEN** the command uses configured values such as `remote.project`, `remote.uri`, or `default_branch` but not a config-derived current branch + +#### Scenario: Remote storage env vars override config +- **WHEN** `DML_REMOTE_BUCKET` or `DML_REMOTE_PREFIX` is set for a remote operation +- **THEN** the command uses the environment value instead of the configured remote storage field + +### Requirement: Supported DML environment variables +The system SHALL support only the DML environment variables defined for the project model and SHALL treat hook context variables as output-only process context. `DML_BRANCH` is not a supported environment variable. 
+ +#### Scenario: Global config home override +- **WHEN** `DML_CONFIG_HOME` is set +- **THEN** the global DML config directory resolves from `DML_CONFIG_HOME` + +#### Scenario: Existing user env remains supported +- **WHEN** `DML_USER` is set and an owner is omitted +- **THEN** the system uses `DML_USER` as the default project owner + +#### Scenario: DML_BRANCH is rejected as unsupported +- **WHEN** `DML_BRANCH` is set during project or runtime command resolution +- **THEN** the system does not use it as checkout state or branch selection input + +#### Scenario: Project env overrides config +- **WHEN** `DML_PROJECT_NAME`, `DML_PROJECT_OWNER`, or `DML_REMOTE_PROJECT` is set +- **THEN** the corresponding supported project config value is overridden for that command + +#### Scenario: Remote env overrides config +- **WHEN** `DML_REMOTE`, `DML_REMOTE_ROOT`, `DML_REMOTE_BUCKET`, or `DML_REMOTE_PREFIX` is set +- **THEN** the corresponding remote selection or storage value is overridden for that command + +#### Scenario: Hook context env is provided by DML +- **WHEN** a hook command runs +- **THEN** DML sets `DML_HOOK`, `DML_PROJECT_HOME`, and, for clone hooks, `DML_REMOTE_NAME` + +### Requirement: Project directory initialization +The system SHALL initialize local project state under `<project-dir>/.dml/` for `init`. 
+ +#### Scenario: Init creates DML directory +- **WHEN** `dml init demo` succeeds +- **THEN** the system creates `demo/.dml/`, `demo/.dml/config.toml`, `demo/.dml/HEAD`, and local database storage under `demo/.dml/db/` + +#### Scenario: Init refuses existing child directory +- **WHEN** `dml init demo` runs and `demo/` already exists +- **THEN** init fails and instructs the user to initialize that directory with `dml init --here demo` + +#### Scenario: Init here creates DML directory in current directory +- **WHEN** `dml init --here demo` succeeds from the current directory +- **THEN** the system creates `.dml/`, `.dml/config.toml`, `.dml/HEAD`, and local database storage under `.dml/db/` + +#### Scenario: Init here uses provided project name +- **WHEN** `dml init --here demo` succeeds from directory `workdir` +- **THEN** the local project name is `demo` + +#### Scenario: Init creates DML gitignore +- **WHEN** `dml init demo` succeeds +- **THEN** the system writes `demo/.dml/.gitignore` containing `*` + +#### Scenario: Init creates initial branch and attaches HEAD +- **WHEN** `dml init demo` succeeds +- **THEN** local storage contains an initial empty commit/tree, local branch `main`, and `.dml/HEAD` attached to `main` + +### Requirement: Init shell hooks +The system SHALL support `post-init` shell hooks from global DML config that run in the project directory after `.dml/` exists. 
+ +#### Scenario: Init hook succeeds +- **WHEN** a `post-init` hook command is configured and `dml init demo` runs +- **THEN** the hook command runs in the `demo` project directory after `demo/.dml/` exists + +#### Scenario: Init here hook succeeds +- **WHEN** a `post-init` hook command is configured and `dml init --here demo` runs +- **THEN** the hook command runs in the current directory after `.dml/` exists + +#### Scenario: Hooks run in configured order +- **WHEN** multiple `post-init` hook commands are configured and `dml init demo` runs +- **THEN** the hook commands run in their configured list order + +#### Scenario: Init no-hooks skips hooks +- **WHEN** `dml init --no-hooks demo` runs +- **THEN** no `post-init` hook commands run + +#### Scenario: Hook environment omits removed branch env +- **WHEN** a `post-init` hook command runs +- **THEN** the process environment includes `DML_HOOK`, `DML_PROJECT_HOME`, `DML_PROJECT_NAME`, `DML_PROJECT_OWNER`, and `DML_CONFIG_HOME`, and does not include `DML_BRANCH` diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/revision-parsing-contract-matrix/spec.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/revision-parsing-contract-matrix/spec.md new file mode 100644 index 0000000..58de878 --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/revision-parsing-contract-matrix/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: Revision-form matrix covers accepted and rejected local resolution boundaries +The centralized parsing matrix SHALL cover the accepted revision forms and local-only rejection boundaries required by commit/project revision resolution behavior, including file-backed `HEAD` semantics. 
+ +#### Scenario: Accepted revision forms resolve with expected classification +- **WHEN** the suite evaluates accepted revision forms (branch, tag, ancestry expression, direct commit id, explicit commit ref, and `HEAD` backed by `.dml/HEAD`) +- **THEN** each form resolves to the expected classification and commit target for the fixture setup + +#### Scenario: Detached HEAD ancestry resolves from HEAD file +- **WHEN** `.dml/HEAD` contains a detached commit payload and the suite evaluates `HEAD~1` +- **THEN** resolution walks ancestry from the detached commit stored in `.dml/HEAD` + +#### Scenario: Unfetched remote revision form fails with local-resolution boundary +- **WHEN** a `dml://...#` revision form is evaluated without corresponding local tracking state +- **THEN** resolution fails with the documented local-resolution boundary error indicating fetch is required diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..790202f --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/specs/shared-internal-configuration/spec.md @@ -0,0 +1,27 @@ +## MODIFIED Requirements + +### Requirement: Canonical config parameters are reduced to one normalized set +The system SHALL normalize supported configuration inputs into the canonical internal parameters `project.home`, `remote.project`, `db.path`, `remote.uri`, `user`, `default_branch`, `hooks.post-init`, `hooks.post-clone`, and `config_home`. 
+ +#### Scenario: Branch context is not a canonical config parameter +- **WHEN** project configuration is resolved +- **THEN** the canonical internal model does not include a separate branch-selection parameter and does not derive the active checkout branch from configuration + +#### Scenario: Legacy overlapping remote parameters are not canonical +- **WHEN** remote-backed configuration is resolved +- **THEN** the canonical remote parameter is `remote.uri` rather than separate `remote.root`, `remote.bucket`, or `remote.prefix` parameters + +### Requirement: Project URI is normalized and exposes helper accessors +The system SHALL normalize and canonicalize local `remote.project` as a branchless project identity through shared revision URI utilities. Resolved configuration SHALL treat checkout state as repository state owned by `.dml/HEAD` rather than as a selector embedded in config. + +#### Scenario: Local project URI remains branchless +- **WHEN** `remote.project` is resolved for local project configuration +- **THEN** shared configuration preserves canonical branchless form `dml://<owner>/<project>` + +#### Scenario: Tag or branch selector is not accepted for local project config +- **WHEN** local project configuration provides `remote.project` with a branch or tag selector +- **THEN** configuration resolution fails instead of translating that selector into checkout state + +#### Scenario: Project helper accessors do not expose current checkout branch +- **WHEN** resolved configuration includes `remote.project` +- **THEN** helper accessors expose project identity only and do not treat config as the source of the active branch or detached commit diff --git a/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/tasks.md b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/tasks.md new file mode 100644 index 0000000..040dad4 --- /dev/null +++ b/openspec/changes/archive/2026-05-05-add-head-file-checkout-state/tasks.md @@ -0,0 +1,29 @@ +## 1. 
Repository state model + +- [x] 1.1 Extend `HeadOps` with public `.dml/HEAD` read/write/resolve methods for attached and detached payloads. +- [x] 1.2 Update repository bootstrap and init flows to create `.dml/HEAD` attached to the initial local branch. +- [x] 1.3 Add contract and integration coverage for valid attached/detached HEAD payloads and invalid HEAD failure modes. + +## 2. Configuration and project layout + +- [x] 2.1 Remove `DML_BRANCH` and any branch-selection config normalization from shared internal configuration resolution. +- [x] 2.2 Change local `DmlProjectConfig` persistence and validation so `remote.project` is branchless and local config no longer stores branch state. +- [x] 2.3 Update init, hook environment, and status/config surfaces to reflect branchless project config and the removal of `DML_BRANCH`. + +## 3. Revision and checkout behavior + +- [x] 3.1 Refactor revision resolution so `HEAD` and `HEAD~n` resolve through `.dml/HEAD` instead of a caller-supplied current branch. +- [x] 3.2 Update repository checkout workflows to rewrite `.dml/HEAD` for attached and detached modes. +- [x] 3.3 Preserve immutable detached semantics so detached commits do not advance any branch ref or rewrite `.dml/HEAD`. + +## 4. Mutable workflow gating + +- [x] 4.1 Update project push defaults so attached local branch `foo` publishes to `dml://<owner>/<project>#foo`. +- [x] 4.2 Require attached HEAD or an explicit mutable branch target for project pull, merge, revert, and similar branch-mutating workflows. +- [x] 4.3 Keep the Python API `Dml(branch=...)` override available while making default runtime behavior derive checkout state from `.dml/HEAD`. + +## 5. Documentation and verification + +- [x] 5.1 Rewrite affected OpenSpec-backed tests for config precedence, init, revision parsing, checkout, and push/pull behavior under the new model. 
+- [x] 5.2 Update repository docs to state the breaking change explicitly, including the lack of backward compatibility for old local config and `DML_BRANCH`. +- [x] 5.3 Run the relevant test suites covering config, head ops, revision parsing, project workflows, and API defaults. diff --git a/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/.openspec.yaml b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/.openspec.yaml new file mode 100644 index 0000000..2188dbd --- /dev/null +++ b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-06 diff --git a/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/design.md b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/design.md new file mode 100644 index 0000000..ad02f4e --- /dev/null +++ b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/design.md @@ -0,0 +1,197 @@ +## Context + +Remote execution lineage is currently keyed by cache key and spread across multiple mutable S3 indexes. That makes administrative graph operations awkward because cache invalidation and cancellation need transitive closure over callers, but the existing storage duplicates forward and reverse lineage in shared JSON objects. Repeated `start_fn` polling also means dependencies are discovered incrementally rather than all at first launch. + +This change moves graph identity to `execution_id`, keeps S3 as the source of truth, and assumes users will periodically ingest remote execution objects into a local database to plan rare manual invalidation and cancellation operations. + +## Goals / Non-Goals + +**Goals:** +- Represent dependency history using immutable execution-edge records keyed by execution id. +- Keep remote writes simple and S3-friendly for many asynchronous writers. +- Make cache invalidation and cancellation planning straightforward in a local SQL database. 
+- Preserve retry and rerun semantics by treating cache refs as pointers to current executions rather than as graph node identities. + +**Non-Goals:** +- Introduce a shared remote transactional database. +- Define a fully automated background reconciler; planning remains a user-driven local operation. + +This design explicitly rejects any backward compatibility with the existing cache-key lineage layout. The new execution-id-based model replaces the prior remote structure wholesale. + +## Decisions + +### Use execution id as graph-node identity +Execution lineage, invalidation markers, and cancellation markers will all be keyed by `execution_id`. `cache_key` remains the computation identity used for lock ownership and cache lookup, but no longer serves as the lineage node identity. + +This decision provides no backward compatibility with cache-key lineage identities. The old lineage model is explicitly unsupported by this change. + +Why this approach: +- reruns naturally create new graph nodes without reinterpreting historical edges +- invalidation and cancellation stay precise across retries +- execution history remains immutable and auditable + +Alternative considered: +- continue using cache-key graph nodes and recursively rewrite cache-key lineage objects. Rejected because cache keys are reused across attempts and make administrative graph semantics ambiguous. + +### Store canonical reverse edges in S3 +The source-of-truth dependency relation will be stored as immutable objects at `exec/edges/<callee_execution_id>/<caller_execution_id>.json`. A caller writes this edge when it concretely discovers the dependency, even if discovery happens on a later poll. + +This storage layout is a clean replacement for the old call-edge layout. No backward-compatible reads or writes to the prior lineage paths are allowed. In particular, nothing in the new implementation reads from `calls/from/...` or `calls/to/...`. 
+ +Why this approach: +- invalidation needs reverse-caller traversal, so listing by callee is the hot administrative query +- object creation is idempotent because the path is canonical +- no shared read/merge/write reverse index is needed + +Alternatives considered: +- store edges by caller and rebuild reverse closure by scanning all edges. Rejected because manual invalidation would require expensive reverse scans. +- store both directions as shared mutable indexes. Rejected because multi-writer maintenance is error-prone on S3. + +### Use one mutable execution object per execution id +`exec/state/<execution_id>.json` becomes the single execution object, updated with compare-and-swap semantics and monotone merges. It stores durable execution record fields such as `created_at`, plus runtime status, current durable adapter `state`, discovered `dependencies`, and `cancel_requested_by` when cancellation is requested. + +This execution object replaces all prior execution-record and live-summary shapes without a compatibility layer. + +Allowed execution `status` values are `running`, `cancel-requested`, `cancelled`, `succeeded`, and `failed`. + +The `exec/state/<execution_id>.json` schema is: + +```json +{ + "execution_id": "E1", + "cache_key": "ck1", + "created_at": 1760000000, + "status": "running", + "state": {}, + "dependencies": ["E2", "E3"], + "updated_at": 1760000000, + "cancel_requested_by": null +} +``` + +Why this approach: +- the split record/live model was not buying enough value to justify extra storage and coordination +- one execution object keeps durable record fields, adapter state, status, and dependencies in one place +- compare-and-swap on one execution-owned summary is simpler than mutating shared graph indexes + +Alternative considered: +- split immutable records from mutable live summaries. Rejected because the only immutable fields were trivial and the split complicated reads and writes. 
+ +### Treat cache refs as projections onto execution history +`refs/cache/<cache_key>.json` will remain a normal cache ref to the current manifest for that cache key, and SHALL also record the current `execution_id` for graph planning. Invalidation starts by resolving a cache key to its current execution id from that ref metadata, then walking the execution graph locally. + +This cache-ref meaning changes as part of the clean replacement. The old cache-key lineage interpretation is not preserved. Cache publication is create-only per cache key path; reruns must invalidate the current cache ref before a later execution republishes that cache key. + +Why this approach: +- the cache remains a current-view pointer rather than the graph itself +- invalidating one historical execution clears the cache-key path so a later rerun can republish a fresh execution id + +Alternative considered: +- invalidate cache keys directly as graph identities. Rejected because cache keys alias multiple executions over time. + +### Plan invalidation and cancellation locally, commit control state remotely +Users will ingest `exec/state`, `exec/edges`, and `refs/cache` into a local database. Invalidation writes immutable `exec/invalidate/<execution_id>.json` tombstones for the reverse caller closure and deletes cache refs that point at invalidated executions. Cancellation updates `exec/state/<execution_id>.json` to `cancel-requested` for the requested execution and any propagated closure where no live callers remain. + +These admin flows operate only on the new execution-id layout. There is no backward-compatible support for the prior cache-key lineage paths, and no admin flow reads from `calls/from/...` or `calls/to/...`. 
+ +The invalidation walk uses the current cache projection as a guardrail: + +```text +seen = [] +seen_set = set() +unseen = set(ref.execution_id for each existing user-requested cache ref) + +while unseen: + exec_id = pop(unseen) + state = read exec/state/<exec_id>.json + if missing: + continue + cache_ref = read refs/cache/<state.cache_key>.json + if missing: + continue + if cache_ref.execution_id != exec_id: + continue + seen.append(exec_id) + seen_set.add(exec_id) + unseen |= callers(exec_id) - seen_set + +for exec_id in reversed(seen): + create exec/invalidate/<exec_id>.json with create-once/CAS semantics + delete refs/cache/<cache_key>.json only if it still points to exec_id +``` + +This ensures historical executions are skipped once a cache key has advanced to a newer execution id, and that cache-ref deletion does not race away a newer publication. + +The cancellation walk is the forward dual over execution dependencies: + +```text +seen = [] +seen_set = set() +unseen = set(user_requested_exec_ids) + +while unseen: + exec_id = pop(unseen) + state = read exec/state/<exec_id>.json + if missing: + continue + if state.status in {"succeeded", "failed", "cancelled"}: + continue + seen.append(exec_id) + seen_set.add(exec_id) + unseen |= set(state.dependencies) - seen_set + +for exec_id in reversed(seen): + state = reread exec/state/<exec_id>.json + if missing: + continue + if state.status in {"succeeded", "failed", "cancelled"}: + continue + caller_count = number of callers of exec_id + whose state exists + and whose status is not in {"cancel-requested", "cancelled", "succeeded", "failed"} + if caller_count > 1: + continue + CAS update exec/state/<exec_id>.json to status = "cancel-requested" + and set cancel_requested_by +``` + +This yields inside-out cancellation: dependencies are processed before their callers, terminal executions are pruned, and a dependency is skipped while more than one uncancelled caller still points to it. 
+ +The `exec/invalidate/<execution_id>.json` schema is: + +```json +{ + "execution_id": "E4", + "cache_key": "ck1", + "requested_by": "alice@example.com", + "requested_at": 1760000000 +} +``` + +Why this approach: +- closure queries and live-caller checks are easy in SQL and awkward in S3 +- S3 remains append-mostly for writers and marker-based for admins +- user-driven admin actions are infrequent, so local planning is acceptable + +Alternative considered: +- maintain a shared mutable database on S3. Rejected because whole-database compare-and-swap creates poor concurrency and brittle failure modes. + +## Risks / Trade-offs + +- [Late dependency discovery means graph snapshots can lag runtime progress] → Treat `exec/edges/*` and `exec/state/*` as eventually complete during running execution; planners should refresh before computing closures. +- [State-object updates may conflict when multiple actors touch the same execution summary] → Require compare-and-swap writes and monotone merges so retries are deterministic. +- [Deleting cache refs after invalidation can race with later successful reruns] → Invalidation is execution-targeted; deleting refs is conditioned on their recorded `execution_id` belonging to the invalidated set. +- [The new remote layout is incompatible with the prior cache-key lineage layout] → Treat this as a clean replacement and remove all prior-path reads and writes in the same change, including any reads from `calls/from/...` and `calls/to/...`. + +## Migration Plan + +1. Implement the execution-id remote layout under `exec/state/*`, `exec/edges/*`, `exec/invalidate/*`, and `refs/cache/*`. +2. Update runtime writers and readers to use only the new execution-id layout. +3. Update local/admin tooling to ingest the new S3 layout into a local database and compute invalidation/cancellation closures from execution ids. + +Rollback strategy: +- None. This change is a clean replacement and does not preserve a backward-compatible path. + +## Open Questions + +None. 
diff --git a/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/proposal.md b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/proposal.md new file mode 100644 index 0000000..1f2c361 --- /dev/null +++ b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/proposal.md @@ -0,0 +1,29 @@ +## Why + +The current remote execution lineage is keyed by cache key and maintained through multiple mutable forward and reverse indexes. That makes cache invalidation and cancellation propagation difficult to reason about, especially when executions are retried and cache keys are reused across attempts. + +## What Changes + +- This change explicitly rejects any backward compatibility with the existing cache-key lineage model. It is a clean, new implementation and SHALL replace the prior remote layout wholesale. +- Move remote execution lineage from cache-key-based call indexes to execution-id-based edge records. +- Change remote cache refs so each cache key remains a proper ref while recording the current execution id for that computation in ref metadata. +- Use a single mutable execution object under `exec/state/` to hold adapter state, lifecycle status, execution timestamps, and discovered dependencies. +- Add remote invalidation markers keyed by execution id and define a cache invalidation algorithm that computes caller closure locally, writes invalidation markers, and removes cache refs that point at invalidated executions. +- Drive cancellation through mutable execution `state/` and define a cancellation algorithm that propagates through execution dependencies when a downstream execution has no remaining live callers. +- Nothing in the new implementation reads from `calls/from/...` or `calls/to/...`; those paths are fully unsupported. +- **BREAKING** Replace the existing cache-key call-edge S3 layout (`calls/from/...`, `calls/to/...`) with execution-id-based storage under `exec/edges/`, `exec/state/`, and `exec/invalidate/`. 
+ +## Capabilities + +### New Capabilities +- `execution-admin-controls`: Manual invalidation and cancellation controls over the execution graph, including closure-based propagation rules and required S3 markers. + +### Modified Capabilities +- `execution-call-edges`: Change lineage persistence from cache-key forward/reverse indexes to execution-id edge records stored by callee execution. +- `runtime-execution-records`: Replace the split record/live model with a single execution object, change cache publication semantics so cache refs record current execution ids while remaining proper refs, and define execution state used by graph planning. + +## Impact + +- Affected code: runtime execution coordination, cache publication/deletion, remote storage layout, CLI/admin flows for cache invalidation and cancellation, and local graph-planning tooling. +- Affected systems: S3-backed execution metadata and any consumers of call-edge lineage. +- Affected APIs/data: remote object layout under `exec/` and `refs/cache/`; prior cache-key call-edge objects are not preserved or supported. diff --git a/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/execution-admin-controls/spec.md b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/execution-admin-controls/spec.md new file mode 100644 index 0000000..3754b72 --- /dev/null +++ b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/execution-admin-controls/spec.md @@ -0,0 +1,102 @@ +## ADDED Requirements + +### Requirement: Manual invalidation SHALL target execution identity +The system SHALL treat cache invalidation as an execution-graph operation. When a user requests invalidation for a cache key, the system SHALL resolve the current execution id from `refs/cache/<cache_key>.json`, compute the reverse caller closure over execution dependencies in a local planning database, and invalidate that execution set. + +The invalidation algorithm SHALL operate as follows: + +1. 
Initialize `seen = []`, `seen_set = set()`, and `unseen = set()`. +2. For each user-provided cache key, read `refs/cache/<cache_key>.json`. +3. If that cache ref exists, add its `execution_id` to `unseen`. +4. While `unseen` is not empty: +5. Remove one `exec_id` from `unseen`. +6. Read `exec/state/<exec_id>.json`; if it does not exist, continue. +7. Read `cache_key` from that execution state object. +8. Read `refs/cache/<cache_key>.json`; if it does not exist, continue. +9. If that cache ref points to a different `execution_id`, continue. +10. Append `exec_id` to `seen` and add it to `seen_set`. +11. Read callers of `exec_id` from `exec/edges/<exec_id>/`. +12. Add `(callers - seen_set)` to `unseen`. +13. After `unseen` is empty, iterate `exec_id` through `reversed(seen)`. +14. For each `exec_id`, write `exec/invalidate/<exec_id>.json` with create-once/CAS semantics. +15. Then delete `refs/cache/<cache_key>.json` with compare-and-swap semantics only if it still points to `exec_id`. + +#### Scenario: Invalidate starts from current cache ref +- **WHEN** a user invalidates cache key `ck1` +- **THEN** the system SHALL read `refs/cache/ck1.json` to determine the current root execution id before planning propagation + +#### Scenario: Historical execution is skipped when cache ref moved +- **WHEN** `exec/state/e1.json` exists but `refs/cache/ck1.json` now points to `e2` instead of `e1` +- **THEN** invalidation SHALL skip `e1` +- **AND** it SHALL NOT add callers of `e1` to the invalidation closure + +### Requirement: Invalidation SHALL write execution tombstones and drop affected cache refs +For every execution id in the invalidation closure, the system SHALL write `exec/invalidate/<execution_id>.json` as an immutable control marker containing `execution_id`, `cache_key`, `requested_by`, and `requested_at`. After planning completes, the system SHALL delete every cache ref whose recorded `execution_id` is in that invalidated set. 
+ +The invalidate tombstone schema SHALL be: + +- `execution_id: str` +- `cache_key: str` +- `requested_by: str` +- `requested_at: int` + +#### Scenario: Invalidation writes control markers and removes cache pointers +- **WHEN** the local planner computes invalidation closure `A` +- **THEN** the system SHALL create `exec/invalidate/<execution_id>.json` for every execution in `A` +- **AND** it SHALL delete each `refs/cache/<cache_key>.json` whose stored `execution_id` belongs to `A` + +#### Scenario: Cache ref delete is guarded by compare-and-swap +- **WHEN** invalidation reaches commit for execution `e1` +- **AND** `refs/cache/ck1.json` no longer points to `e1` +- **THEN** the system SHALL NOT delete that cache ref + +#### Scenario: Invalidation tombstone stores requester metadata +- **WHEN** the system writes `exec/invalidate/e1.json` +- **THEN** that object SHALL contain `execution_id`, `cache_key`, `requested_by`, and `requested_at` + +### Requirement: Manual cancellation SHALL target execution identity +The system SHALL treat cancellation as an execution-graph operation keyed by execution id. A user cancellation request SHALL update `exec/state/<execution_id>.json` so that the execution transitions to `cancel-requested` and records `cancel_requested_by`. + +The cancellation algorithm SHALL operate as follows: + +1. Initialize `seen = []`, `seen_set = set()`, and `unseen = set(user_requested_exec_ids)`. +2. While `unseen` is not empty: +3. Remove one `exec_id` from `unseen`. +4. Read `exec/state/<exec_id>.json`; if it does not exist, continue. +5. If `status` is `succeeded`, `failed`, or `cancelled`, continue. +6. Append `exec_id` to `seen` and add it to `seen_set`. +7. Add `(dependencies - seen_set)` from that state object to `unseen`. +8. After `unseen` is empty, iterate `exec_id` through `reversed(seen)`. +9. For each `exec_id`, reread `exec/state/<exec_id>.json`; if it does not exist, continue. +10. If `status` is `succeeded`, `failed`, or `cancelled`, continue. +11. 
Count callers of `exec_id` from `exec/edges/<exec_id>/` whose state exists and whose `status` is not `cancel-requested`, `cancelled`, `succeeded`, or `failed`. +12. If that uncancelled caller count is greater than `1`, continue. +13. Otherwise update `exec/state/<exec_id>.json` with compare-and-swap semantics so that `status = "cancel-requested"` and `cancel_requested_by` identifies the requesting user. + +#### Scenario: Direct cancellation updates live execution summary +- **WHEN** a user cancels execution `e1` +- **THEN** the system SHALL update `exec/state/e1.json` so that `status = "cancel-requested"` +- **AND** `cancel_requested_by` identifies the requesting user + +#### Scenario: Dependency is cancel-requested before caller +- **WHEN** execution `e1` depends on execution `e2` +- **THEN** the cancellation commit phase SHALL process `e2` before `e1` + +### Requirement: Cancellation propagation SHALL stop when a callee still has a live caller +The local planner SHALL propagate cancellation only across non-terminal execution dependencies. It SHALL stop recursing when it reaches a terminal execution. Among non-terminal executions in the dependency closure, it SHALL request cancellation only when a candidate execution has no remaining live callers after accounting for callers already included in the cancelling closure. 
+ +#### Scenario: Shared dependency is preserved while another caller remains live +- **WHEN** execution `e2` depends on `e3` and a different live execution `e4` also depends on `e3` +- **THEN** cancelling `e2` SHALL NOT require `e3` to be cancelled while `e4` remains a live caller + +#### Scenario: Uncancelled caller count greater than one blocks cancellation +- **WHEN** execution `e3` has two uncancelled callers +- **THEN** the cancellation algorithm SHALL skip `e3` for that iteration + +#### Scenario: Sole dependency is cancelled recursively +- **WHEN** execution `e2` depends on `e3` and `e2` is the only live caller of `e3` +- **THEN** cancelling `e2` SHALL cause the planner to mark `e3` for cancellation as part of the propagated closure + +#### Scenario: Terminal dependency is not cancelled +- **WHEN** execution `e2` depends on execution `e3` and `e3` is already terminal +- **THEN** cancelling `e2` SHALL NOT request cancellation for `e3` diff --git a/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/execution-call-edges/spec.md b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/execution-call-edges/spec.md new file mode 100644 index 0000000..6f3a553 --- /dev/null +++ b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/execution-call-edges/spec.md @@ -0,0 +1,42 @@ +## REMOVED Requirements + +### Requirement: Runtime SHALL distinguish user-dags from fn-dags for call-edge tracking +**Reason**: The execution graph now uses execution ids as the only lineage node identity. +**Migration**: Remove cache-key and index-based lineage writes entirely and derive graph roots only from cache refs and execution state objects. No backward-compatible reads or writes to the prior lineage layout are supported, and nothing reads from `calls/from/...` or `calls/to/...`. 
+ +### Requirement: Runtime SHALL persist forward call-edge indexes by caller type +**Reason**: Forward lineage is no longer stored as mutable cache-key arrays segmented by caller type. +**Migration**: Replace prior forward lineage storage with canonical execution-edge records at `exec/edges/<callee_execution_id>/<caller_execution_id>.json`. No backward-compatible forward lineage path remains, and `calls/from/...` is never read. + +### Requirement: Runtime SHALL persist reverse call-edge indexes for callee cache keys +**Reason**: Reverse lineage is no longer stored as shared mutable cache-key index objects. +**Migration**: Replace prior reverse lineage reads with `exec/edges/<callee_execution_id>/`. No backward-compatible reverse lineage path remains, and `calls/to/...` is never read. + +### Requirement: Call-edge index updates SHALL be concurrency-safe and canonicalized +**Reason**: Edge records become immutable create-once objects rather than merge-updated JSON indexes. +**Migration**: Replace prior merge-updated lineage objects with idempotent create semantics for canonical edge-object paths. No backward-compatible lineage-update path remains, and no code path consults `calls/from/...` or `calls/to/...`. + +## MODIFIED Requirements + +### Requirement: Call-edge records SHALL represent realized execution dependencies +The runtime SHALL record only realized execution dependencies. An edge SHALL mean that caller execution `caller_execution_id` was observed to depend on callee execution `callee_execution_id` during runtime execution, even if that dependency is discovered during a later `start_fn` poll cycle. 
+ +#### Scenario: Dependency discovered after initial launch still creates edge +- **WHEN** execution `e0` does not know about callee `e1` on its first poll but discovers that dependency on a later poll +- **THEN** the runtime SHALL create the edge record for `e1 <- e0` when that dependency becomes known + +#### Scenario: Repeated observation does not require a second edge fact +- **WHEN** execution `e0` rediscovers an existing dependency on `e1` +- **THEN** the runtime SHALL continue to treat `e1 <- e0` as one canonical edge fact + +### Requirement: Runtime SHALL persist canonical edge records by callee execution id +The runtime SHALL persist each execution dependency as the immutable object `exec/edges/<callee_execution_id>/<caller_execution_id>.json`. The payload SHALL include only `caller_execution_id` and `callee_execution_id`. + +#### Scenario: Edge record is written at canonical path +- **WHEN** execution `e0` discovers a dependency on execution `e1` +- **THEN** the runtime SHALL write `exec/edges/e1/e0.json` +- **AND** that object SHALL contain JSON with `caller_execution_id = "e0"` and `callee_execution_id = "e1"` + +#### Scenario: Reverse lineage query lists callers by callee execution id +- **WHEN** an invalidation planner needs all callers of execution `e1` +- **THEN** it SHALL obtain them by reading the objects under `exec/edges/e1/` diff --git a/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/runtime-execution-records/spec.md b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/runtime-execution-records/spec.md new file mode 100644 index 0000000..a9bda79 --- /dev/null +++ b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/specs/runtime-execution-records/spec.md @@ -0,0 +1,117 @@ +## MODIFIED Requirements + +### Requirement: Runtime SHALL separate cache identity from execution identity +The runtime SHALL treat `cache_key` as the stable computation identity and `execution_id` as the stable identity of one execution attempt. 
The runtime SHALL acquire execution locks by `cache_key`, SHALL propagate `execution_id` in the adapter envelope, and SHALL use execution id as the identity for dependency edges, execution state objects, and invalidation records. + +#### Scenario: First launch creates a new execution identity +- **WHEN** `start_fn` observes a cache miss and confirms there is no active execution for the computed `cache_key` +- **THEN** it creates a new `execution_id` for that launch attempt +- **AND** it invokes the adapter with both `cache_key` and `execution_id` + +#### Scenario: Resume preserves the current execution identity +- **WHEN** `start_fn` observes an active execution for a `cache_key` +- **THEN** it SHALL reuse the referenced `execution_id` +- **AND** it SHALL NOT create a new `execution_id` for that execution while resuming it + +### Requirement: Runtime SHALL maintain an active execution pointer per cache key +The runtime SHALL persist the currently active execution for a `cache_key` at `active/<cache_key>` as plain text containing only the `execution_id`. + +#### Scenario: Active pointer is created for a new running execution +- **WHEN** the first adapter call for a new execution returns `running` +- **THEN** the runtime SHALL create `active/<cache_key>` containing that execution's `execution_id` + +#### Scenario: Stale active pointer is discarded +- **WHEN** `active/<cache_key>` exists but `exec/state/<execution_id>.json` does not exist +- **THEN** the runtime SHALL delete `active/<cache_key>` +- **AND** it SHALL treat the cache key as having no active execution + +### Requirement: Runtime SHALL maintain one mutable execution object per execution id +The runtime SHALL persist `exec/state/<execution_id>.json` as the single compare-and-swap updated execution object for that execution. That object SHALL include `execution_id`, `cache_key`, `created_at`, `status`, `state`, `dependencies`, `updated_at`, and `cancel_requested_by`, where `cancel_requested_by` is `str | null`. 
`status` SHALL be one of `running`, `cancel-requested`, `cancelled`, `succeeded`, or `failed`. `state` SHALL contain the durable adapter state returned by the first adapter call for that execution and SHALL be `null` when no durable adapter state exists. Once `state` is first written for an execution, the runtime SHALL NOT replace or merge it on later updates. `dependencies` SHALL be the deduped set of discovered callee execution ids for that execution. Execution-object updates SHALL be monotone: newly discovered dependencies MAY be added, terminal status MAY replace non-terminal status, `cancel-requested` MAY precede `cancelled`, and existing dependencies SHALL NOT be removed. + +The execution-object schema SHALL be: + +- `execution_id: str` +- `cache_key: str` +- `created_at: int` +- `status: "running" | "cancel-requested" | "cancelled" | "succeeded" | "failed"` +- `state: object | null` +- `dependencies: list[str]` +- `updated_at: int` +- `cancel_requested_by: str | null` + +#### Scenario: First adapter call creates the execution object +- **WHEN** the first adapter call for a new execution returns any valid adapter result +- **THEN** the runtime SHALL create `exec/state/<execution_id>.json` +- **AND** that object SHALL contain the returned adapter `state` when one exists + +#### Scenario: First execution object records creation time +- **WHEN** the runtime first creates `exec/state/<execution_id>.json` +- **THEN** that object SHALL contain `created_at` +- **AND** `created_at` SHALL remain unchanged on later updates + +#### Scenario: Resume uses stored execution state +- **WHEN** `start_fn` resumes an active execution +- **THEN** it SHALL load the adapter `state` from `exec/state/<execution_id>.json` +- **AND** it SHALL pass that stored state to the adapter + +#### Scenario: Later running result does not replace stored execution state +- **WHEN** the runtime invokes an adapter for an existing execution and the adapter returns `running` with durable `state` +- **THEN** the runtime SHALL keep the existing 
stored `state` in `exec/state/<execution_id>.json` + +#### Scenario: Late dependency discovery expands execution summary +- **WHEN** execution `e0` later discovers a dependency on execution `e1` +- **THEN** the runtime SHALL update `exec/state/e0.json` so that `dependencies` contains `e1` + +#### Scenario: Dependency merge survives compare-and-swap retry +- **WHEN** a compare-and-swap update to `exec/state/e0.json` observes a conflicting write +- **THEN** the runtime SHALL reread, merge the dependency set and monotone status fields, and retry the conditional write + +#### Scenario: Cancellation requester is recorded on cancel request +- **WHEN** a user requests cancellation for execution `e0` +- **THEN** the runtime SHALL update `exec/state/e0.json` so that `status = "cancel-requested"` +- **AND** `cancel_requested_by` contains the requesting user identity + +#### Scenario: Execution object includes minimal execution fields +- **WHEN** the runtime persists `exec/state/e0.json` +- **THEN** that object SHALL contain `execution_id`, `cache_key`, `created_at`, `status`, `state`, `dependencies`, `updated_at`, and `cancel_requested_by` + +#### Scenario: Execution object rejects unknown status values +- **WHEN** the runtime validates or persists `exec/state/e0.json` +- **THEN** `status` SHALL be one of `running`, `cancel-requested`, `cancelled`, `succeeded`, or `failed` + +### Requirement: Cache refs SHALL remain proper refs and record execution ids +The runtime SHALL publish `refs/cache/<cache_key>.json` as a normal cache ref to the current manifest for that cache key, and that ref SHALL also record `execution_id` for the current execution. Readers that materialize cached results SHALL continue resolving the cached manifest through the ref target, and graph planners SHALL read `execution_id` from the same cache ref. 
+ +#### Scenario: Successful execution updates cache pointer +- **WHEN** execution `e7` becomes the terminal cached result for cache key `ck1` +- **THEN** the runtime SHALL write `refs/cache/ck1.json` with `execution_id = "e7"` +- **AND** that object SHALL remain a valid cache ref with its manifest `target` + +#### Scenario: Re-run requires prior invalidation +- **WHEN** a later execution `e8` attempts to publish a terminal cached result for cache key `ck1` +- **AND** `refs/cache/ck1.json` already exists for an earlier execution +- **THEN** the runtime SHALL reject that cache publication +- **AND** the earlier cache ref MUST be invalidated or deleted before `e8` can publish `refs/cache/ck1.json` + +### Requirement: Adapter envelope and result schema SHALL follow the runtime-owned execution contract +The adapter envelope SHALL include `argv_ptr`, `cache_key`, `execution_id`, `remote`, `runnable`, and `state`. The adapter result SHALL use only `running`, `succeeded`, or `failed` statuses. `running` MUST include durable `state`. `succeeded` MUST include `dag_id`. `failed` MUST include `error`. + +#### Scenario: First adapter call uses null state +- **WHEN** the runtime invokes an adapter for a new execution +- **THEN** the adapter envelope SHALL include `state = null` + +#### Scenario: Later adapter state is ignored after first write +- **WHEN** the runtime invokes an adapter for an existing execution and the adapter returns `running` with a different `state` +- **THEN** the runtime SHALL continue using the existing stored `state` from `exec/state/<execution_id>.json` + +### Requirement: Failed execution SHALL be cached as a terminal result +If an adapter returns `failed`, the runtime SHALL complete the DAG with the error and SHALL publish that failed terminal outcome to cache for the `cache_key`. 
+ +#### Scenario: Failed adapter result populates cache +- **WHEN** an adapter returns `failed` for a cache key +- **THEN** the runtime SHALL complete the DAG with the reported error +- **AND** it SHALL publish the failed outcome into cache for that cache key + +#### Scenario: Failed execution clears active pointer +- **WHEN** an active execution returns `failed` +- **THEN** the runtime SHALL delete `active/<cache_key>` before surfacing the failure diff --git a/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/tasks.md b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/tasks.md new file mode 100644 index 0000000..79d7b19 --- /dev/null +++ b/openspec/changes/archive/2026-05-07-execution-id-graph-invalidation/tasks.md @@ -0,0 +1,26 @@ +## 1. Remote Model Replacement + +- [x] 1.1 Replace cache-key call-edge writes with canonical edge objects at `exec/edges/<callee_execution_id>/<caller_execution_id>.json` +- [x] 1.2 Replace the split record/live model with a single mutable execution object at `exec/state/<execution_id>.json`, including `created_at` +- [x] 1.3 Change cache refs so `refs/cache/<cache_key>.json` remains a proper ref while recording the current execution id for that cache key +- [x] 1.4 Add immutable invalidation tombstones at `exec/invalidate/<execution_id>.json` and include requester metadata + +## 2. Runtime Execution Updates + +- [x] 2.1 Update `start_fn` and related runtime paths to preserve execution-id identity across first launch and resume +- [x] 2.2 Implement compare-and-swap updates for `exec/state/<execution_id>.json` with monotone merges of `status`, `dependencies`, and `cancel_requested_by` while preserving `created_at` and the first-written adapter `state` +- [x] 2.3 Write edge objects when dependencies are concretely discovered during execution, including late discovery on later poll cycles +- [x] 2.4 Remove all reads and writes for the prior cache-key `calls/from/...` and `calls/to/...` lineage layout + +## 3. 
Local Planning And Admin Operations + +- [x] 3.1 Build local ingestion of `exec/state`, `exec/edges`, `exec/invalidate`, and `refs/cache` into a queryable local database +- [x] 3.2 Implement cache invalidation planning with the `seen`/`unseen` traversal, current-cache guard, create-once invalidation tombstones, and compare-and-swap cache-ref deletes +- [x] 3.3 Implement cancellation planning with forward dependency traversal, terminal-state pruning, reverse commit order, and uncancelled-caller counting before setting `cancel-requested` +- [x] 3.4 Update CLI or admin entry points to use the new local planning flow for invalidation and cancellation + +## 4. Verification + +- [x] 4.1 Add or update runtime tests that cover first-call state creation, resume reusing the stored adapter state, late dependency discovery, `created_at` preservation, cancellation requester recording, and canonical edge-object writes +- [x] 4.2 Add or update admin-planning tests for cache invalidation closure, current-cache guarding, CAS-protected cache-ref deletion, and cancellation propagation with shared, sole, and terminal dependencies +- [x] 4.3 Add tests that prove the new execution-id flows work without consulting `calls/from/...` or `calls/to/...`, for example by exercising the new layout while those old paths are absent or stale diff --git a/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/.openspec.yaml b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/.openspec.yaml new file mode 100644 index 0000000..054b8c0 --- /dev/null +++ b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-08 diff --git a/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/design.md b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/design.md new file mode 100644 index 0000000..bf627fb --- /dev/null +++ 
b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/design.md @@ -0,0 +1,76 @@ +## Context + +`daggerml.contrib.supervisor` currently launches a worker subprocess, redirects worker `stdout` and `stderr` to local files, waits for exit, and returns a terminal result based on `result.json` or the worker exit status. That shape is useful and should remain intact, but it provides no durable live observability for long-running or remote executions because the logs exist only in a temporary workdir and are removed during script-executor cleanup. + +The repository already depends on `boto3`, so CloudWatch Logs integration can be added without introducing a new package or changing the runtime ownership model. The user requirement also calls for safe fallback behavior: log shipping must never become part of execution correctness. + +## Goals / Non-Goals + +**Goals:** +- Preserve the current supervisor/executor control flow and terminal result contract. +- Stream worker `stdout` and `stderr` to CloudWatch Logs while the worker is still running. +- Keep writing local `stdout.log` and `stderr.log` files for fallback and local debugging. +- Emit start and end lifecycle messages that include `execution_id`, `cache_key`, stream kind, and terminal status where applicable. +- Make CloudWatch failures best-effort so worker execution still completes normally when log shipping fails. + +**Non-Goals:** +- Changing adapter output, result publication, or polling semantics. +- Introducing a new dependency such as `watchtower` or an external `aws` CLI requirement. +- Generalizing this change to every executor in the same change. +- Making CloudWatch configuration dynamically user-selectable beyond the fixed group and stream naming required by this change. +- Adding a backward-compatibility code path, legacy supervisor logging mode, or stream-name shim layer. 
+ +## Decisions + +### Use the supervisor as the CloudWatch log owner +The supervisor already owns worker process launch, local log capture, and terminal result interpretation. Extending it to own CloudWatch streaming keeps observability close to the existing process boundary and avoids spreading log-shipping concerns into polling or higher runtime layers. + +Alternative considered: ship logs from the script executor during `poll()`. Rejected because polling is intermittent rather than continuous, complicates state ownership, and cannot provide true live streaming. + +### Read worker `stdout` and `stderr` through pipes and tee them to local files plus CloudWatch +To stream logs while the worker runs, the supervisor should launch the worker with `stdout=PIPE` and `stderr=PIPE`, consume each pipe concurrently, append the bytes to the existing local log files, and batch line-oriented CloudWatch events for the corresponding stream. + +This replaces the existing direct worker-to-file redirection path in the supervisor rather than preserving a separate legacy implementation branch. Local file capture remains part of the single active path, not a compatibility mode. + +Alternative considered: keep file redirection and tail the files from background threads. Rejected because it adds file-offset bookkeeping and weaker real-time behavior without simplifying the code meaningfully. + +### Use one CloudWatch log stream per output channel +The log group is fixed to `dml`, and the stream names are fixed to `/run/{cache_key}/stdout` and `/run/{cache_key}/stderr`. Separate streams preserve channel identity without interleaving rules or merged timestamps. + +The implementation should use those exact stream names and should not introduce a sanitization or aliasing shim. If a computed stream name cannot be used with CloudWatch as-is, CloudWatch delivery for that channel should fail safely and local log capture should continue. + +Alternative considered: a single merged log stream. 
Rejected because preserving stdout/stderr separation would require extra envelope data for every message and would make direct CloudWatch inspection harder. + +### Emit explicit lifecycle events at the beginning and end of each stream +Each stream should begin with a metadata event describing the execution and stream kind, and end with a metadata event describing the same execution plus terminal status. These events provide stable anchors even when the worker itself emits no output. + +Alternative considered: rely only on raw worker output. Rejected because silent workers would produce no CloudWatch evidence that streaming was configured or completed. + +### Make CloudWatch delivery best-effort and self-disabling on failure +If CloudWatch client creation, log group/stream setup, or `put_log_events` fails, the supervisor should record the problem locally and continue writing worker output to local files. Repeated CloudWatch failures for a stream should disable further CloudWatch writes for that stream instead of repeatedly failing in the hot path. + +Alternative considered: fail the supervisor when CloudWatch initialization fails. Rejected because it would turn observability into a correctness dependency and violate the fallback requirement. + +## Risks / Trade-offs + +- CloudWatch stream writes add thread and batching complexity to the supervisor -> Keep the implementation narrow: one reader thread per pipe, one CloudWatch sink per stream, and no changes to executor polling semantics. +- `cache_key` may contain characters or length patterns that CloudWatch stream names may reject -> Do not add a name-rewriting shim; let CloudWatch delivery disable itself for that channel and preserve local log capture. +- Buffered or partial worker output may not align perfectly with line boundaries -> Buffer partial lines in the reader thread and flush any remainder on EOF. 
+- CloudWatch API throttling or transient failures can drop streamed logs -> Keep logging best-effort, flush final buffered events on shutdown, and preserve the local files as the fallback record. +- Start and end lifecycle messages may be duplicated if retry logic is too broad -> Make lifecycle emission stream-local and idempotent within a single supervisor run. + +## Migration Plan + +No persisted data migration is required. + +Implementation rollout is additive: +1. Add supervisor-side CloudWatch streaming in the single supervisor launch path defined by this change. +2. Extend integration and unit coverage for local fallback, lifecycle events, and non-fatal CloudWatch failures. +3. Update contrib runtime docs to describe the new observability behavior. + +Rollback is straightforward: remove or disable the supervisor CloudWatch streaming path and retain the existing local log-file capture behavior. + +## Open Questions + +- What exact event message shape should be used for lifecycle messages: plain text with embedded metadata or compact JSON payloads. +- Whether the final terminal metadata event should include the supervisor return classification only, or also worker exit-code/signal details when available. diff --git a/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/proposal.md b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/proposal.md new file mode 100644 index 0000000..c25b0e0 --- /dev/null +++ b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/proposal.md @@ -0,0 +1,27 @@ +## Why + +Supervisor-backed script executions currently capture worker `stdout` and `stderr` only in local temporary files. That makes live debugging difficult for remote or long-running executions, and the logs disappear when the executor cleans up its workdir. + +## What Changes + +- Add best-effort CloudWatch Logs streaming for supervisor-managed worker `stdout` and `stderr`. 
+- Keep the current supervisor/executor structure: the supervisor still launches the worker, captures local log files, waits for terminal completion, and returns the same terminal result contract. +- Stream logs concurrently while the worker runs so polling and execution progress can be observed in near real time. +- Use log group `dml` and per-run log streams `/run/{cache_key}/stdout` and `/run/{cache_key}/stderr`. +- Emit structured lifecycle messages at stream start and stream end containing execution metadata such as `execution_id`, `cache_key`, and terminal status. +- Make CloudWatch failures non-fatal and fall back safely to local `stdout`/`stderr` capture. +- Implement the change as a single supervisor path with no backward-compatibility branch, no legacy logging mode, and no name-rewriting shim for CloudWatch stream names. + +## Capabilities + +### New Capabilities +- `supervisor-cloudwatch-streaming`: Best-effort streaming of supervisor-managed worker `stdout` and `stderr` to CloudWatch Logs with start/end lifecycle messages and safe fallback behavior. + +### Modified Capabilities + +## Impact + +- Affected code: `src/daggerml/contrib/supervisor.py`, `src/daggerml/contrib/executors/script.py`, and related contrib integration tests. +- Affected docs/specs: contrib runtime and executor behavior docs, plus a new OpenSpec capability for supervisor log streaming. +- Dependencies: no new package dependencies; implementation uses the existing `boto3` runtime dependency. +- Systems: AWS CloudWatch Logs for observability, with unchanged execution correctness when CloudWatch is unavailable or misconfigured. 
diff --git a/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/specs/supervisor-cloudwatch-streaming/spec.md b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/specs/supervisor-cloudwatch-streaming/spec.md new file mode 100644 index 0000000..ae5b6fd --- /dev/null +++ b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/specs/supervisor-cloudwatch-streaming/spec.md @@ -0,0 +1,49 @@ +## ADDED Requirements + +### Requirement: Supervisor streams worker stdout and stderr to CloudWatch Logs +The supervisor SHALL stream worker `stdout` and `stderr` to AWS CloudWatch Logs while the worker process is still running, in addition to preserving local `stdout.log` and `stderr.log` files. + +#### Scenario: Stdout is streamed while the worker runs +- **WHEN** the supervisor starts a worker that writes to `stdout` +- **THEN** the supervisor writes the output to the local `stdout.log` file and publishes the same output to CloudWatch Logs before the worker exits + +#### Scenario: Stderr is streamed while the worker runs +- **WHEN** the supervisor starts a worker that writes to `stderr` +- **THEN** the supervisor writes the output to the local `stderr.log` file and publishes the same output to CloudWatch Logs before the worker exits + +### Requirement: Supervisor uses fixed CloudWatch log destinations per run +The supervisor SHALL publish worker logs to log group `dml` and SHALL use exactly two log streams named `/run/{cache_key}/stdout` and `/run/{cache_key}/stderr` for the corresponding worker output channels. 
+ +#### Scenario: Stdout stream name is derived from cache key +- **WHEN** the supervisor launches a worker for a given `cache_key` +- **THEN** worker `stdout` events are published to CloudWatch log stream `/run/{cache_key}/stdout` in log group `dml` + +#### Scenario: Stderr stream name is derived from cache key +- **WHEN** the supervisor launches a worker for a given `cache_key` +- **THEN** worker `stderr` events are published to CloudWatch log stream `/run/{cache_key}/stderr` in log group `dml` + +#### Scenario: Supervisor does not rewrite stream names +- **WHEN** the supervisor computes CloudWatch stream names from `cache_key` +- **THEN** it uses the exact names `/run/{cache_key}/stdout` and `/run/{cache_key}/stderr` without a compatibility alias or sanitization shim + +### Requirement: Supervisor emits lifecycle metadata at stream start and end +The supervisor SHALL emit a lifecycle event to each CloudWatch log stream when streaming begins and another lifecycle event when streaming ends. Lifecycle events SHALL include `execution_id`, `cache_key`, the stream kind (`stdout` or `stderr`), and the terminal status when streaming ends. + +#### Scenario: Start lifecycle event is emitted before worker output +- **WHEN** the supervisor initializes CloudWatch streaming for a worker output channel +- **THEN** it first publishes a lifecycle event containing the execution metadata for that channel before publishing worker output events + +#### Scenario: End lifecycle event is emitted after worker exit +- **WHEN** the worker process has exited and the supervisor has determined the terminal result +- **THEN** it publishes a lifecycle event containing the execution metadata and terminal status for each channel before closing CloudWatch streaming + +### Requirement: CloudWatch failures do not fail worker execution +CloudWatch client, log-stream, or event-delivery failures SHALL be non-fatal to execution. 
When CloudWatch streaming fails, the supervisor SHALL continue capturing worker output locally and SHALL continue evaluating the worker terminal result using the existing supervisor result contract. + +#### Scenario: CloudWatch initialization fails +- **WHEN** the supervisor cannot initialize CloudWatch logging for a worker output channel +- **THEN** the supervisor continues the worker run, preserves local log-file capture, and still returns the worker terminal result normally + +#### Scenario: CloudWatch delivery fails after streaming has started +- **WHEN** CloudWatch event delivery fails during an active worker run +- **THEN** the supervisor continues capturing output locally for the rest of the run and still returns the worker terminal result normally diff --git a/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/tasks.md b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/tasks.md new file mode 100644 index 0000000..93ec024 --- /dev/null +++ b/openspec/changes/archive/2026-05-08-stream-supervisor-cloudwatch-logs/tasks.md @@ -0,0 +1,16 @@ +## 1. Supervisor CloudWatch streaming + +- [x] 1.1 Add supervisor-side CloudWatch logging helpers that target log group `dml` and streams `/run/{cache_key}/stdout` and `/run/{cache_key}/stderr` using the existing `boto3` dependency. +- [x] 1.2 Change the supervisor worker launch path to read `stdout` and `stderr` from pipes, tee each channel into the existing local log files, and stream each channel concurrently while the worker runs, without keeping a legacy alternate path. +- [x] 1.3 Emit per-stream lifecycle messages at startup and shutdown that include `execution_id`, `cache_key`, stream kind, and terminal status, and make CloudWatch failures self-disabling and non-fatal without adding compatibility aliases or stream-name shims. + +## 2. 
Runtime behavior and tests + +- [x] 2.1 Preserve the existing supervisor terminal result behavior while ensuring CloudWatch streaming flushes and shuts down cleanly after worker exit. +- [x] 2.2 Add or update supervisor and script-executor tests for combined live stdout/stderr capture and local-file preservation. +- [x] 2.3 Add failure-path tests covering CloudWatch initialization or delivery errors to verify execution still succeeds or fails based only on the worker terminal result. + +## 3. Documentation and verification + +- [x] 3.1 Update the contrib runtime and executor docs to describe supervisor-managed CloudWatch log streaming and its best-effort fallback behavior. +- [x] 3.2 Run the targeted test coverage for supervisor and contrib script execution paths, and fix any regressions introduced by the streaming change. diff --git a/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/.openspec.yaml b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/.openspec.yaml new file mode 100644 index 0000000..0478d8f --- /dev/null +++ b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-09 diff --git a/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/design.md b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/design.md new file mode 100644 index 0000000..3074951 --- /dev/null +++ b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/design.md @@ -0,0 +1,61 @@ +## Context + +The shared configuration model already defines canonical parameter names such as `project.home`, `remote.uri`, and `config_home`, and the CLI delegates config resolution to `DmlConfig`. Even so, the top-level parser and some help/error surfaces still expose older frontend-specific spellings like `--repo` and `--remote-root`. 
The change is cross-cutting because it touches parser setup, help examples, normalized error hints, and CLI-focused tests across multiple command paths. + +The concrete CLI files affected by the audit are `src/daggerml/_cli/__init__.py`, `base.py`, `init.py`, `status.py`, `config.py`, and `remote.py`. Other CLI modules do not currently hardcode the legacy flag names, so they are expected to remain unchanged unless verification uncovers stale help text. + +## Goals / Non-Goals + +**Goals:** +- Rename user-facing CLI flags so explicit overrides mirror the canonical config keys they feed. +- Keep the CLI thin by changing parser names and forwarded argument attributes without moving domain logic into CLI modules. +- Update help text, examples, and structured error hints so the docs and runtime guidance use one naming scheme. +- Update tests so contract coverage reflects the renamed public surface. + +**Non-Goals:** +- Changing the underlying config schema, resolution precedence, or environment variable names. +- Adding compatibility aliases for the old flag spellings. +- Renaming Python API arguments such as `Dml(repo=...)` as part of this change. + +## Decisions + +### Rename only CLI-facing flags, not canonical internal fields +The implementation will rename parser options like `--repo` to `--project-home` and `--remote-root` to `--remote-uri`, while continuing to resolve those values through `DmlConfig` as `project.home` and `remote.uri`. This keeps the internal contract unchanged and limits the change to the transport surface. + +Alternative considered: rename internal config fields or add a second normalization layer in CLI code. That was rejected because the canonical internal names are already established in docs and code, and extra translation logic would weaken the thin-interface contract. + +### Treat the rename as an intentional breaking CLI update +The old spellings will be removed rather than kept as aliases. 
This matches the request to make CLI flags replicas of config args throughout and avoids indefinite dual-name maintenance in help text, tests, and user guidance. + +Alternative considered: keep both old and new flags for a deprecation period. That was rejected because it would preserve the naming ambiguity this change is trying to remove. + +### Preserve handler wiring and output behavior +Handlers will continue to forward parsed values into the same internal operations, with attribute names adjusted as needed to avoid business logic changes. Success payloads and structured error formatting stay the same except for user-facing hints that mention the renamed flags. + +Alternative considered: broaden the change into CLI argument refactoring or parser restructuring. That was rejected because the minimal correct change is a surface rename plus matching documentation and test updates. + +### Separate parser destinations where public flag names overlap +The top-level CLI should expose `--remote-uri`, and `init` should continue exposing its own `--remote-uri` input for project bootstrap. The implementation will keep the shared public spelling but use distinct argparse destinations so command execution can distinguish top-level runtime override input from init-specific remote configuration input. + +Alternative considered: rename one of the two public flags to avoid overlap. That was rejected because `remote.uri` is already the canonical config name for both concepts, and the requested change is to make CLI flags mirror config argument names throughout. + +## Risks / Trade-offs + +- [Existing scripts break on old flags] -> Mitigation: document the rename as breaking in the proposal/specs and update all in-repo examples/tests in the same change. +- [Some help text or error hints keep stale names] -> Mitigation: update shared helper messages and grep CLI/tests/docs for `--repo` and `--remote-root` before concluding the implementation. 
+- [Parser dest renames could accidentally break command execution] -> Mitigation: keep the forwarding shape explicit in each handler and verify through CLI contract/integration tests. +- [Top-level and init `--remote-uri` values collide in the argparse namespace] -> Mitigation: assign separate internal destinations and add parser/command tests that cover both forms. + +## Migration Plan + +1. Rename the top-level and command-specific parser flags to canonical config-shaped names in `__init__.py`, `init.py`, `status.py`, and `remote.py`. +2. Update CLI handlers and shared helpers in `base.py`, `init.py`, `status.py`, and `config.py` to read the new parsed argument attributes. +3. Assign separate argparse destinations for the top-level and init `--remote-uri` flags. +4. Update help examples, docs, and normalized error hints. +5. Update CLI contract and integration tests to use the new flag names and ensure stale names are rejected. + +Rollback is straightforward: restore the prior parser option names and corresponding examples/messages if the renamed surface causes unacceptable breakage before release. + +## Open Questions + +- None. The requested direction is explicit: CLI flags should match the config argument names throughout. diff --git a/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/proposal.md b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/proposal.md new file mode 100644 index 0000000..58bc8a2 --- /dev/null +++ b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/proposal.md @@ -0,0 +1,28 @@ +## Why + +The CLI still exposes flag names such as `--repo` and `--remote-root` even though the shared configuration model is defined in canonical names like `project.home` and `remote.uri`. This mismatch makes the CLI harder to learn, weakens the contract between docs and implementation, and causes help text, error hints, and tests to drift away from the configuration model they are supposed to expose. 
+ +## What Changes + +- Rename CLI flags so user-facing option names mirror their canonical configuration keys where practical, including replacing `--repo` with `--project-home` and `--remote-root` with `--remote-uri`. +- Update command help text, examples, and normalized CLI error hints to use the canonical flag names. +- Update init command inputs and any other CLI-exposed overrides so they consistently use config-shaped names already defined by the shared resolver. +- Update contract and integration tests to cover the renamed flags and reject stale flag names once the rename lands. +- **BREAKING**: Remove the old CLI flag spellings where they conflict with the canonical config naming contract. + +## Capabilities + +### New Capabilities + + +### Modified Capabilities +- `cli-thin-interface`: The CLI surface will change its public flag names while preserving thin delegation behavior and output structure. +- `shared-internal-configuration`: CLI explicit-argument naming will align with the canonical config keys exposed by the shared resolver. + +## Impact + +- Affected code: `src/daggerml/_cli/__init__.py`, `base.py`, `init.py`, `status.py`, `config.py`, `remote.py`, related internal error messages, and CLI-facing docs. +- Affected tests: CLI contract and integration tests that parse or invoke the renamed flags. +- Affected users: anyone invoking the CLI with `--repo` or other non-canonical override names. +- Special implementation concern: the top-level `--remote-uri` rename will overlap with `init --remote-uri`, so parser destinations must remain distinct even if the public flag spelling is shared. +- No new runtime dependencies are expected. 
diff --git a/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/specs/cli-thin-interface/spec.md b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/specs/cli-thin-interface/spec.md new file mode 100644 index 0000000..76059b7 --- /dev/null +++ b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/specs/cli-thin-interface/spec.md @@ -0,0 +1,33 @@ +## ADDED Requirements + +### Requirement: CLI project and remote override flags use canonical config-shaped names +The CLI SHALL expose explicit project and remote override flags using the canonical configuration naming represented by the shared resolver, rather than frontend-specific aliases. + +#### Scenario: Top-level project override uses canonical name +- **WHEN** a user passes an explicit project directory to any command +- **THEN** the CLI accepts `--project-home PATH` as the top-level override flag +- **AND** the CLI does not advertise `--repo` as the supported flag name + +#### Scenario: Top-level remote override uses canonical name +- **WHEN** a user passes an explicit remote project URI to any command +- **THEN** the CLI accepts `--remote-uri URI` as the top-level override flag +- **AND** the CLI does not advertise `--remote-root` as the supported flag name + +### Requirement: CLI guidance uses canonical flag names consistently +CLI help text, examples, and normalized user-facing recovery hints SHALL use the same canonical flag names as the parser surface. 
+ +#### Scenario: Help examples show canonical overrides +- **WHEN** a user opens top-level or subcommand help for commands that mention explicit config overrides +- **THEN** the examples and help text refer to `--project-home` and `--remote-uri` instead of legacy aliases + +#### Scenario: Missing project-home hint uses canonical flag name +- **WHEN** command execution fails because no local project path can be resolved +- **THEN** the structured error hint instructs the user to pass `--project-home PATH` or set `DML_PROJECT_HOME` + +### Requirement: Shared public flag names do not create ambiguous CLI behavior +When the CLI uses the same canonical public flag spelling in different parser scopes, command dispatch SHALL preserve the intended meaning for each command path. + +#### Scenario: Init keeps its own remote-uri input without shadowing the top-level override +- **WHEN** the CLI exposes both a top-level `--remote-uri` option and `init --remote-uri` +- **THEN** parsing and command execution keep those inputs distinguishable +- **AND** `init` continues to forward its own `--remote-uri` value to bootstrap project remote configuration diff --git a/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..4e274fc --- /dev/null +++ b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/specs/shared-internal-configuration/spec.md @@ -0,0 +1,18 @@ +## ADDED Requirements + +### Requirement: CLI explicit override names mirror canonical config parameters +The CLI SHALL name explicit configuration override flags after the canonical parameters they populate in the shared internal resolver whenever those parameters are exposed directly to users. 
+ +#### Scenario: Project-home flag maps to canonical parameter +- **WHEN** the CLI resolves an explicit local project path override +- **THEN** it reads that value from a flag named after `project.home` +- **AND** it forwards the value into shared resolution as `project.home` + +#### Scenario: Remote-uri flag maps to canonical parameter +- **WHEN** the CLI resolves an explicit remote project override +- **THEN** it reads that value from a flag named after `remote.uri` +- **AND** it forwards the value into shared resolution as `remote.uri` + +#### Scenario: Existing canonical names remain unchanged +- **WHEN** the CLI exposes other explicit config-shaped overrides such as `--remote-project` or `--config-home` +- **THEN** those flags continue using the established canonical names rather than introducing alternate aliases diff --git a/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/tasks.md b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/tasks.md new file mode 100644 index 0000000..e8a4595 --- /dev/null +++ b/openspec/changes/archive/2026-05-09-align-cli-flags-with-config-args/tasks.md @@ -0,0 +1,18 @@ +## 1. Parser And Handler Updates + +- [x] 1.1 Rename top-level CLI override flags from `--repo` and `--remote-root` to `--project-home` and `--remote-uri` in `src/daggerml/_cli/__init__.py`. +- [x] 1.2 Update `src/daggerml/_cli/base.py`, `init.py`, `status.py`, and `config.py` so renamed parser destinations still resolve into canonical `project.home` and `remote.uri` values. +- [x] 1.3 Keep the public `--remote-uri` spelling in both top-level CLI parsing and `init`, but assign distinct argparse destinations so the two inputs do not collide. +- [x] 1.4 Update shared CLI error hints and command help text in `__init__.py`, `init.py`, `status.py`, and `remote.py` that still reference legacy flag names. + +## 2. 
Documentation And Examples + +- [x] 2.1 Update CLI examples and user-facing references in `docs/cli.md` and touched command epilog/help text to use `--project-home` and `--remote-uri`. +- [x] 2.2 Audit the repository for stale CLI references to `--repo` or `--remote-root` and replace the ones that describe the public CLI surface, including related internal error strings surfaced to CLI users. + +## 3. Verification + +- [x] 3.1 Update CLI contract tests to parse and execute the renamed flags, including coverage for top-level parser setup, `status`, `config`, and `init` entry points. +- [x] 3.2 Add test coverage for the dual `--remote-uri` surface so top-level overrides and `init --remote-uri` remain distinguishable. +- [x] 3.3 Update CLI integration tests that invoke init and full project lifecycle flows so they use the renamed flags throughout. +- [x] 3.4 Run the relevant CLI-focused test suite and confirm legacy flag references no longer appear in supported help or error guidance. diff --git a/openspec/changes/archive/2026-05-10-centralize-dml-resolution/.openspec.yaml b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/.openspec.yaml new file mode 100644 index 0000000..ac20efa --- /dev/null +++ b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-10 diff --git a/openspec/changes/archive/2026-05-10-centralize-dml-resolution/design.md b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/design.md new file mode 100644 index 0000000..73f54f6 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/design.md @@ -0,0 +1,76 @@ +## Context + +`src/daggerml/_internal/dml_resolution.py` already resolves revisions and basic DAG selectors, but `src/daggerml/_internal/dml.py` still contains node-specific lookup rules and wrapper methods that assemble resolution payloads. 
That split leaves selector parsing, ambiguity checks, and canonicalization spread across two modules even though they are part of the same concern: turning user-facing selectors into stable internal refs. + +This change needs a design document because the behavior crosses commit, DAG, and node lookup paths and should produce one shared resolution contract that DML callers can rely on. + +## Goals / Non-Goals + +**Goals:** +- Make `dml_resolution.py` the single home for fuzzy selector resolution logic used by DML. +- Provide explicit helpers for commit, DAG, and node resolution that always return canonical `Ref` objects for resolved entities. +- Preserve the existing ergonomic input forms where they are unambiguous, including direct refs, raw object ids in supported formats, and named lookups. +- Move `dml.py` to a thin orchestration role that calls shared resolution helpers instead of re-implementing selector parsing. + +**Non-Goals:** +- Redesign the CLI or user-facing output payload shapes beyond what is required to reflect the new canonical resolution behavior. +- Change storage layouts, ref encodings, or DAG/node persistence semantics. +- Broaden selector syntax beyond commit, DAG, and node resolution needs covered by the current DML surface. + +## Decisions + +### Introduce shared resolution helpers in `dml_resolution.py` + +`dml_resolution.py` will own helper functions for revision, DAG, and node resolution. Each helper will accept the operation dependencies it needs (`commit_ops`, `dag_ops`, `head_ops`, `project_dir`) and return resolved refs plus any minimal metadata needed by callers. + +Rationale: this keeps resolution behavior centralized without coupling the module to `Dml` instance internals. + +Alternative considered: keep resolution wrappers in `dml.py` and only move small parsing helpers. Rejected because it would leave behavior split across modules and preserve the current drift risk. 
+ +### Make DAG and node resolution canonicalize to `Ref` + +DAG resolution will continue to accept either an explicit `dag:` ref or a DAG name resolved through a commit selector, but the resolved object returned to callers will always be a `Ref`. Node resolution will similarly return a node `Ref` whether the input was already a ref, a node-id style selector, or a name looked up through a DAG. + +Rationale: callers should not need to track whether a selector was direct or fuzzy after resolution succeeds. + +Alternative considered: keep mixed return shapes such as `(optional_ref, name)` and let each caller finish the lookup. Rejected because it pushes ambiguity handling back out to callers. + +### Define ambiguity handling around named node lookups + +Node resolution will recognize three cases: +- Direct node refs, which resolve immediately. +- Node-id style selectors such as `node-literal:abc123`, which are interpreted as canonical node refs if valid. +- Named node selectors, which require DAG context only when the selector is not already a direct ref. + +If the node selector is a name and multiple DAGs could satisfy the lookup without an explicit DAG selector, resolution must fail with a clear repository error asking for DAG disambiguation instead of guessing. + +Rationale: this preserves convenience for unambiguous selectors while making ambiguity explicit. + +Alternative considered: always require `dag_selector` for name-based node lookup. Rejected because it would remove an intended ergonomic path and is stricter than the requested behavior. + +### Keep `dml.py` as an orchestration layer only + +`dml.py` will stop parsing node selector strings or deciding when a DAG selector is mandatory. It will call `dml_resolution.py` helpers, then use the returned refs to build payloads and invoke ops methods. + +Rationale: `dml.py` should coordinate operations, not own selector semantics. + +Alternative considered: duplicate small guards in `dml.py` for readability. 
Rejected because “small” resolution checks tend to grow and recreate the current split. + +## Risks / Trade-offs + +- [Behavior drift in edge-case selectors] -> Add or update focused tests for direct refs, raw ids, named selectors, and ambiguous node lookups. +- [Resolution helpers may need more dependencies passed in] -> Prefer a small number of explicit helper parameters over reaching back into `Dml` state. +- [Existing callers may assume mixed return shapes] -> Update all DML internal call sites in the same change so the new contract is applied consistently. + +## Migration Plan + +1. Add the new shared resolution helpers in `dml_resolution.py`. +2. Update `dml.py` to delegate commit, DAG, and node selector handling to those helpers. +3. Adjust or add tests for canonical ref returns and ambiguity errors. +4. Run the relevant test suite for DML and selector-related behavior. + +Rollback is straightforward: the change is internal-only and can be reverted by restoring the previous helper split if regressions appear. + +## Open Questions + +- Whether node-name lookup without an explicit DAG selector should search only the selected commit’s named DAG map or also support broader repository-wide fallback. The current implementation intent suggests commit-scoped lookup, and this change assumes that narrower rule unless tests or existing behavior require otherwise. diff --git a/openspec/changes/archive/2026-05-10-centralize-dml-resolution/proposal.md b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/proposal.md new file mode 100644 index 0000000..b707ef6 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/proposal.md @@ -0,0 +1,25 @@ +## Why + +Resolution behavior for commits, DAGs, and nodes is currently split between `dml.py` and `dml_resolution.py`, which makes ambiguous selector handling harder to reason about and easier to drift out of sync. 
Centralizing all fuzzy resolution logic in one module will make the DML surface more predictable and give future callers a single contract for converting user selectors into canonical refs. + +## What Changes + +- Move all selector-resolution logic for commits, DAGs, and nodes into `src/daggerml/_internal/dml_resolution.py`. +- Define node resolution so it accepts either a direct node ref, a node-id style selector such as `node-literal:abc123`, or a node name plus optional dag selector. +- Require a dag selector only when named node resolution is ambiguous; direct node refs and node-id selectors resolve without DAG context. +- Standardize commit, DAG, and node resolution helpers to always return `Ref` instances for resolved objects. +- Remove remaining resolution logic from `src/daggerml/_internal/dml.py` so it delegates entirely to `dml_resolution.py`. + +## Capabilities + +### New Capabilities +- `dml-resolution`: Centralized DML selector resolution for commits, DAGs, and nodes with canonical `Ref` return values. + +### Modified Capabilities +- None. + +## Impact + +- Affected code: `src/daggerml/_internal/dml.py`, `src/daggerml/_internal/dml_resolution.py`, and any callers/tests that depend on selector-resolution behavior. +- API impact: DML-facing selector behavior becomes more explicit around ambiguous node lookups and canonical ref returns. +- System impact: Resolution rules move closer to a single internal boundary, reducing duplicate parsing and validation paths. 
diff --git a/openspec/changes/archive/2026-05-10-centralize-dml-resolution/specs/dml-resolution/spec.md b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/specs/dml-resolution/spec.md new file mode 100644 index 0000000..dc0f42f --- /dev/null +++ b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/specs/dml-resolution/spec.md @@ -0,0 +1,60 @@ +## ADDED Requirements + +### Requirement: Revision resolution returns canonical commit refs +The DML resolution layer SHALL accept supported revision selectors, including direct commit refs, commit ids, `HEAD` ancestry selectors, branch names, and supported `dml://` revision URIs, and SHALL resolve them to a canonical commit `Ref`. + +#### Scenario: Resolve a symbolic revision selector +- **WHEN** a caller resolves a supported symbolic revision selector such as `HEAD`, `HEAD~1`, a branch name, or a supported `dml://` URI +- **THEN** the resolution layer returns the corresponding commit `Ref` + +#### Scenario: Reject an invalid revision selector +- **WHEN** a caller resolves a revision selector that is empty, malformed, or points to an unsupported object namespace +- **THEN** the resolution layer raises `DmlRepoError` + +### Requirement: DAG resolution returns canonical dag refs +The DML resolution layer SHALL accept either a direct `dag:` ref or a DAG name combined with a revision selector and SHALL resolve the result to a canonical dag `Ref`. 
+ +#### Scenario: Resolve an explicit dag ref +- **WHEN** a caller resolves a selector that is already a valid `dag:` ref +- **THEN** the resolution layer returns that dag as a `Ref` + +#### Scenario: Resolve a named dag from a revision +- **WHEN** a caller resolves a DAG name together with a commit-reachable revision selector +- **THEN** the resolution layer returns the dag `Ref` mapped to that name in the selected commit + +#### Scenario: Reject incompatible dag inputs +- **WHEN** a caller provides an explicit `dag:` ref together with an incompatible revision override +- **THEN** the resolution layer raises `DmlRepoError` + +### Requirement: Node resolution accepts direct refs, node-id selectors, and named lookups +The DML resolution layer SHALL accept node selectors as direct node refs, canonical node-id style selectors such as `node-literal:abc123`, or node names resolved through DAG context, and SHALL return a canonical node `Ref`. + +#### Scenario: Resolve a direct node ref +- **WHEN** a caller resolves a selector that is already a valid node `Ref` +- **THEN** the resolution layer returns that node as a `Ref` + +#### Scenario: Resolve a node-id style selector +- **WHEN** a caller resolves a selector string that matches a valid canonical node-id style selector +- **THEN** the resolution layer interprets it as a node `Ref` and returns it + +#### Scenario: Resolve a named node lookup +- **WHEN** a caller resolves a node name together with sufficient DAG context +- **THEN** the resolution layer returns the named node as a `Ref` + +### Requirement: Ambiguous node lookup requires dag disambiguation +The DML resolution layer MUST require an explicit DAG selector when a name-based node lookup cannot be resolved unambiguously from the available context, and it MUST fail with `DmlRepoError` instead of guessing. 
+ +#### Scenario: Reject ambiguous named node lookup +- **WHEN** a caller resolves a node name without a direct node ref or canonical node-id selector and the available context does not identify a single DAG +- **THEN** the resolution layer raises `DmlRepoError` instructing the caller to provide DAG context + +#### Scenario: Allow unambiguous lookup without explicit dag selector +- **WHEN** a caller resolves a node name without a `dag_selector` and the available context identifies exactly one matching DAG +- **THEN** the resolution layer returns the matching node `Ref` + +### Requirement: DML delegates selector resolution to the shared resolution layer +The `dml.py` orchestration layer SHALL use shared helpers from `dml_resolution.py` for commit, DAG, and node selector handling instead of implementing independent selector parsing logic. + +#### Scenario: DML resolves a node selector +- **WHEN** DML code needs to resolve a node selector for a DAG operation +- **THEN** it uses the shared resolution layer and consumes the returned `Ref` rather than duplicating selector parsing rules locally diff --git a/openspec/changes/archive/2026-05-10-centralize-dml-resolution/tasks.md b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/tasks.md new file mode 100644 index 0000000..8ae6510 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-centralize-dml-resolution/tasks.md @@ -0,0 +1,16 @@ +## 1. Resolution Helpers + +- [x] 1.1 Expand `src/daggerml/_internal/dml_resolution.py` with shared helpers for canonical commit, DAG, and node resolution. +- [x] 1.2 Implement node selector handling for direct node refs, canonical node-id style selectors, and named node lookups with clear ambiguity errors. +- [x] 1.3 Ensure DAG and node resolution helpers return canonical `Ref` instances and reject incompatible selector combinations. + +## 2. 
DML Integration + +- [x] 2.1 Remove selector parsing and ambiguity logic from `src/daggerml/_internal/dml.py` and delegate to `dml_resolution.py`. +- [x] 2.2 Update any other internal DML call sites that depend on mixed resolution return shapes to use the new shared helper contract. + +## 3. Verification + +- [x] 3.1 Add or update tests covering revision, DAG, and node resolution with direct refs, raw ids, named selectors, and invalid inputs. +- [x] 3.2 Add or update tests covering ambiguous named node lookup and the requirement for explicit DAG disambiguation when needed. +- [x] 3.3 Run the relevant DML and selector-related test suite and fix any regressions. diff --git a/openspec/changes/archive/2026-05-10-redesign-cli-surface/.openspec.yaml b/openspec/changes/archive/2026-05-10-redesign-cli-surface/.openspec.yaml new file mode 100644 index 0000000..0478d8f --- /dev/null +++ b/openspec/changes/archive/2026-05-10-redesign-cli-surface/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-09 diff --git a/openspec/changes/archive/2026-05-10-redesign-cli-surface/design.md b/openspec/changes/archive/2026-05-10-redesign-cli-surface/design.md new file mode 100644 index 0000000..d39e50a --- /dev/null +++ b/openspec/changes/archive/2026-05-10-redesign-cli-surface/design.md @@ -0,0 +1,134 @@ +## Context + +The current CLI already has some git-like project verbs at the top level, but repository inspection still exposes internal storage nouns such as `commit`, `head`, `index`, `dag`, `node`, `cache`, and `remote` as the primary public interface. That makes common workflows harder to discover and couples the user model to implementation details instead of the repository concepts users care about: revisions, branch state, DAG maps, and administrative maintenance. + +This redesign is intentionally a breaking CLI reset. 
The new surface keeps git-shaped porcelain at the top level, moves DAG inspection under `dml dag`, moves exceptional maintenance flows under `dml admin`, and standardizes all CLI outputs as JSON without changing any on-disk repository or remote storage formats. + +The change is cross-cutting because it affects parser structure, CLI routing, JSON contracts, repository inspection entrypoints, DAG lookup flows, branch listing behavior, index reporting, and remote discovery/maintenance paths. + +## Goals / Non-Goals + +**Goals:** +- Present a coherent git-like top-level CLI for repository history and branch workflows. +- Make DAG inspection the first-class analogue to file inspection through `dml dag`. +- Isolate low-frequency maintenance flows under `dml admin`. +- Define stable JSON output contracts for the redesigned commands. +- Preserve thin CLI routing by moving orchestration and lookup behavior into non-CLI layers. +- Keep local and remote storage formats unchanged. + +**Non-Goals:** +- Preserving any backward-compatible aliases, old command names, or legacy output payloads. +- Reworking the repository data model, HEAD file format, config file format, or remote CAS+refs layout. +- Introducing text-mode porcelain output that mimics git's terminal formatting. +- Exposing low-level remote push/pull plumbing commands as part of the new public CLI. +- Expanding cache management beyond explicit invalidation by exact cache key. + +## Decisions + +### Top-level porcelain is revision-centric rather than storage-centric + +The public top-level surface will be `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, and `revert`. + +Rationale: +- This matches the mental model we want: users inspect repository history and branch state the way git users inspect trees and commits. +- It removes internal storage nouns from the common path. + +Alternatives considered: +- Keep `commit`, `head`, and `index` public and add aliases. 
Rejected because the change is explicitly breaking and the old nouns would keep the split mental model alive. +- Move everything under subcommands. Rejected because top-level verbs are part of the git-like feel. + +### `dml show` returns full DAG state plus commit delta + +`dml show <revision>` will return top-level keys `revision`, `commit`, `dags`, and `change`. + +`dags` is the complete DAG name-to-ref map for the resolved revision. `change` is the DAG-map delta introduced by that commit relative to its base commit. + +Rationale: +- Users need both the complete tree picture and the specific change introduced by the commit. +- Keeping both in one payload avoids forcing a follow-up `dag list` call for context. + +Alternatives considered: +- Return only the diff. Rejected because it omits the complete tree state. +- Nest the full tree under `tree`. Rejected in favor of promoting `dags` to a top-level field for clarity and directness. + +### DAG inspection is organized by name-oriented lookups + +`dml dag list` returns `dict[str, str]` for a revision. `dml dag get <selector> [--revision REV]` returns a full DAG payload including node data. + +When `dag get` receives an explicit `dag:<id>` selector, `--revision` is rejected. When it receives a plain name, the name is resolved against the DAG map for the selected revision. + +Rationale: +- Most human workflows start from a DAG name in a revision tree, not a raw DAG ref. +- One `dag get` endpoint is enough if the payload includes node data. + +Alternatives considered: +- Separate DAG metadata and node-inspection endpoints. Rejected because it keeps too much plumbing visible. +- Silently ignore `--revision` for `dag:<id>`. Rejected because it hides an invalid combination. 
+ +### Administrative workflows are isolated under `dml admin` + +The admin surface will contain: +- `index list|get|delete` +- `cache invalidate ...` +- `remote list [--owner OWNER]` +- `remote list dml://<owner>/<project>` +- `remote gc` +- `gc [--dry-run]` + +Rationale: +- These are low-frequency maintenance or recovery flows, not normal repository inspection. +- Grouping them under `admin` keeps the main CLI focused while still exposing necessary escape hatches. + +Alternatives considered: +- `runtime` for indexes/cache. Rejected because local GC and remote maintenance are not runtime state, and `admin` better matches the operational nature of the commands. +- Keep `remote` as a public top-level group. Rejected because user-facing sync remains `fetch`, `pull`, and `push`. + +### Remote discovery uses one overloaded `admin remote list` + +`dml admin remote list [--owner OWNER]` lists projects as canonical `dml://<owner>/<project>` URIs. `dml admin remote list dml://<owner>/<project>` lists remote branches and tags for that project. + +Rationale: +- The overload follows the user's mental flow: list projects first, then inspect one project's remote refs. +- It avoids adding extra one-off verbs such as `list-projects` and `list-refs`. + +Alternatives considered: +- Separate `list-projects` and `list-refs`. Rejected because it adds naming surface without more expressive power. + +### Local and remote GC stay distinct + +`dml admin gc` cleans local scratch-space storage and supports `--dry-run`. `dml admin remote gc` performs remote maintenance, including remote CAS/ref GC and remote transport cleanup, under one user-facing command. + +Rationale: +- Local and remote cleanup have different stakes and should not be conflated. +- Remote prune and remote GC are implementation details that can be composed under one admin command. + +Alternatives considered: +- One combined local+remote GC command. Rejected because users must never wonder whether a local cleanup also touched the remote. 
+- Separate remote `prune` and `gc`. Rejected because the user explicitly wants one remote maintenance command. + +### Index reporting includes commit metadata, not just commit refs + +`dml admin index list` returns indexes plus the commits they point to, and `dml admin index get` returns index state including full commit information rather than only a commit ref. + +Rationale: +- Indexes are debugging/admin state. Returning the pointed-to commit metadata avoids immediate follow-up lookups and makes the admin commands useful on their own. + +Alternatives considered: +- Return only commit ids. Rejected because it is too sparse for an admin inspection endpoint. + +### Full config status moves to `dml config show` + +`dml status` becomes repository/runtime status. Full resolved config output moves to `dml config show [--contrib]` and remains JSON. + +Rationale: +- `status` should describe repository state in a git-shaped CLI. +- Config remains important, but it is a different concern. + +## Risks / Trade-offs + +- [Breaking CLI change] → Document the new command table clearly in specs, docs, and tests; do not preserve aliases that would muddy the new surface. +- [Cross-layer churn] → Introduce domain entrypoints for repository inspection and admin operations so the CLI remains thin. +- [Output contract drift] → Capture JSON payload shapes in specs and tests before implementation. +- [Remote list ambiguity] → Validate argument shape explicitly so project listing and per-project ref listing remain deterministic. +- [Remote GC scope confusion] → Keep `dml admin gc` and `dml admin remote gc` separate and document their different targets. +- [Admin command creep] → Limit `admin` to the locked set and defer additional plumbing commands unless a clear use case appears. 
diff --git a/openspec/changes/archive/2026-05-10-redesign-cli-surface/proposal.md b/openspec/changes/archive/2026-05-10-redesign-cli-surface/proposal.md new file mode 100644 index 0000000..5076203 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-redesign-cli-surface/proposal.md @@ -0,0 +1,29 @@ +## Why + +The current `dml` CLI mixes git-like project verbs with storage-oriented plumbing commands, which makes common repository inspection feel inconsistent and exposes internal object boundaries as the primary user model. We want the CLI to read like git for repository history and branch workflows while making DAG inspection the first-class analogue to file inspection. + +## What Changes + +- **BREAKING** Replace the current top-level CLI surface with a git-shaped porcelain centered on `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, and `revert`. +- **BREAKING** Replace storage-oriented inspection commands with DAG-oriented commands under `dml dag`, including `dag list`, `dag get`, `dag checkout`, and `dag delete`. +- **BREAKING** Move exceptional maintenance flows under `dml admin`, including index inspection/deletion, cache invalidation by cache key, remote discovery, remote GC, and local GC. +- Define stable JSON output contracts for the redesigned commands, including the locked `dml show` shape with top-level `revision`, `commit`, `dags`, and `change` keys. +- Redefine `dml status` as repository/runtime status instead of resolved config output, and move full config reporting to `dml config show [--contrib]`. +- Preserve thin CLI routing by keeping command handlers focused on parsing, delegation, and JSON serialization rather than embedding orchestration logic. + +## Capabilities + +### New Capabilities +- `repo-inspection-cli`: Git-shaped repository inspection and DAG inspection command contracts, including `show`, `status`, `log`, `diff`, `branch`, and `dag` output schemas. 
+- `admin-cli-controls`: Administrative command contracts for index management, cache invalidation, remote project discovery, and local/remote garbage collection. + +### Modified Capabilities +- `cli-thin-interface`: Document the intentional CLI compatibility break while preserving the requirement that CLI modules remain thin transport adapters. +- `git-like-commit-ops`: Extend git-like repository workflows to cover branch listing semantics and revision-oriented inspection flows that power the new CLI surface. + +## Impact + +- Affects `src/daggerml/_cli/**`, `src/daggerml/_internal/ops/**`, CLI docs, and command contract tests. +- Changes the public CLI grammar and JSON payloads for existing commands. +- Requires new domain entrypoints for repository inspection, DAG lookup by revision/name, admin remote discovery, and richer index reporting. +- Keeps existing on-disk state formats and remote storage layout unchanged. diff --git a/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/admin-cli-controls/spec.md b/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/admin-cli-controls/spec.md new file mode 100644 index 0000000..b41f98a --- /dev/null +++ b/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/admin-cli-controls/spec.md @@ -0,0 +1,79 @@ +## ADDED Requirements + +### Requirement: Administrative CLI flows are grouped under `dml admin` +Low-frequency maintenance and recovery commands SHALL be exposed under `dml admin` rather than as top-level porcelain commands. + +#### Scenario: Admin help groups maintenance commands +- **WHEN** a user inspects `dml admin` help +- **THEN** index management, cache invalidation, remote discovery, remote garbage collection, and local garbage collection appear under `dml admin` + +### Requirement: Admin index list returns indexes with commit info +`dml admin index list` SHALL return every live index together with commit information for the commit each index currently points to. 
+ +#### Scenario: Index list includes commit summaries +- **WHEN** a user runs `dml admin index list` +- **THEN** the command returns JSON with an `indexes` field +- **AND** each index entry includes its identifier and commit information for the pointed-to commit + +### Requirement: Admin index get returns full index inspection payload +`dml admin index get <index>` SHALL return index inspection data including commit information for the commit the index points to, rather than only a commit identifier. + +#### Scenario: Index get includes commit details +- **WHEN** a user runs `dml admin index get idx1` +- **THEN** the command returns JSON with an `index` object +- **AND** that object includes commit metadata for the pointed-to commit + +### Requirement: Admin index delete removes an index +`dml admin index delete <index>` SHALL delete the selected index and report the deletion result as JSON. + +#### Scenario: Index delete reports success +- **WHEN** a user runs `dml admin index delete idx1` +- **THEN** the command returns JSON containing `index` and `deleted` + +### Requirement: Admin cache invalidation accepts exact cache keys only +`dml admin cache invalidate <cache-key> [more cache keys]` SHALL accept one or more exact cache keys and SHALL NOT accept DAG refs, argv refs, or other selector types. + +#### Scenario: Cache invalidation accepts multiple exact keys +- **WHEN** a user runs `dml admin cache invalidate ck1 ck2` +- **THEN** the command invalidates those exact cache keys +- **AND** returns JSON containing `cache_keys` and `invalidated` + +#### Scenario: Cache invalidation rejects non-key selector forms +- **WHEN** a user runs `dml admin cache invalidate dag:abc123` +- **THEN** the command fails because admin cache invalidation accepts exact cache keys only + +### Requirement: Admin remote list can list projects or one project's refs +`dml admin remote list` SHALL support two modes through one command shape. 
+ +Without a project argument, it SHALL list remote projects as canonical `dml://<owner>/<project>` URIs and MAY filter by owner. With a `dml://<owner>/<project>` argument, it SHALL list the remote branches and tags for that project. + +#### Scenario: Remote list returns projects +- **WHEN** a user runs `dml admin remote list` +- **THEN** the command returns JSON with a `projects` field containing canonical project URIs + +#### Scenario: Remote list filters by owner +- **WHEN** a user runs `dml admin remote list --owner alice` +- **THEN** the command returns only projects owned by `alice` + +#### Scenario: Remote list returns project refs +- **WHEN** a user runs `dml admin remote list dml://alice/demo` +- **THEN** the command returns JSON containing `project`, `branches`, and `tags` + +### Requirement: Admin remote GC performs remote maintenance +`dml admin remote gc` SHALL perform remote maintenance for the configured remote, including remote GC of CAS/refs state and remote transport cleanup, and SHALL report the result as JSON. + +#### Scenario: Remote GC reports cleanup summary +- **WHEN** a user runs `dml admin remote gc` +- **THEN** the command returns JSON summarizing deleted remote refs, CAS objects, and transport objects + +### Requirement: Admin local GC supports dry-run inspection +`dml admin gc` SHALL garbage-collect unreachable local objects. When `--dry-run` is provided, it SHALL report what would be deleted without deleting it. 
+ +#### Scenario: Local GC deletes unreachable objects +- **WHEN** a user runs `dml admin gc` +- **THEN** the command returns JSON describing deleted local objects + +#### Scenario: Local GC dry run reports orphans +- **WHEN** a user runs `dml admin gc --dry-run` +- **THEN** the command returns JSON containing `dry_run`, `would_delete`, and `orphans` +- **AND** the command does not delete local objects diff --git a/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/cli-thin-interface/spec.md b/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/cli-thin-interface/spec.md new file mode 100644 index 0000000..2b741ea --- /dev/null +++ b/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/cli-thin-interface/spec.md @@ -0,0 +1,12 @@ +## MODIFIED Requirements + +### Requirement: CLI output contract remains stable through documented compatibility changes +Refactoring to enforce a thin CLI boundary MUST preserve documented user-visible command semantics, including success output structure and failure signaling, except where a change explicitly defines a breaking CLI compatibility update. 
+ +#### Scenario: Refactor preserves behavior outside documented breaks +- **WHEN** CLI logic is moved into domain modules for commands whose public contract is unchanged by an approved change +- **THEN** command outputs and exit outcomes remain equivalent for existing supported invocations + +#### Scenario: Approved CLI redesign may replace old command contracts +- **WHEN** an approved change explicitly redefines the public CLI grammar and JSON payloads +- **THEN** the implementation MAY remove prior command names and prior output payload shapes for the affected commands diff --git a/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..43588df --- /dev/null +++ b/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/git-like-commit-ops/spec.md @@ -0,0 +1,47 @@ +## ADDED Requirements + +### Requirement: Repository inspection workflows resolve revisions locally +The system SHALL provide repository inspection workflows for `show`, `log`, and `diff` that resolve revisions locally without performing implicit network fetches. + +#### Scenario: Show resolves revision locally +- **WHEN** a user runs `dml show origin/main` +- **THEN** the system resolves `origin/main` through existing local tracking state +- **AND** it does not contact the remote automatically + +#### Scenario: Diff resolves both revisions locally +- **WHEN** a user runs `dml diff dml://alice/demo#main HEAD` +- **THEN** the system resolves both revisions from local state only + +### Requirement: Branch listing exposes remote-tracking branches +The system SHALL support listing locally tracked remote branches for git-like branch inspection. 
+ +#### Scenario: Branch remote lists tracked refs +- **WHEN** a user runs `dml branch --remote` +- **THEN** the system returns the set of locally tracked remote branch selectors + +### Requirement: Repository status reports current DAG map and live indexes +The system SHALL provide a repository status workflow that reports the current HEAD state, local branches, the DAG map for the current revision, and live indexes. + +#### Scenario: Status reports attached head +- **WHEN** HEAD is attached to branch `main` and a user runs `dml status` +- **THEN** the response reports attached head state for `main` +- **AND** includes the DAG map for the commit selected by that head + +#### Scenario: Status reports detached head +- **WHEN** HEAD is detached and a user runs `dml status` +- **THEN** the response reports detached head state and the current commit + +### Requirement: Show returns commit delta over DAG namespace +The system SHALL compute commit-introduced change for `dml show` as DAG-map additions, removals, and updates between the selected commit tree and its base tree. 
+ +#### Scenario: Show detects DAG addition +- **WHEN** a commit introduces `train -> dag:a` where the base tree had no `train` +- **THEN** `dml show` reports `train` under `change.added` + +#### Scenario: Show detects DAG update +- **WHEN** a commit changes `train` from `dag:a` to `dag:b` +- **THEN** `dml show` reports `train` under `change.updated` with `before` and `after` + +#### Scenario: Show detects DAG removal +- **WHEN** a commit removes `train -> dag:a` +- **THEN** `dml show` reports `train` under `change.removed` diff --git a/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/repo-inspection-cli/spec.md b/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/repo-inspection-cli/spec.md new file mode 100644 index 0000000..3c7717d --- /dev/null +++ b/openspec/changes/archive/2026-05-10-redesign-cli-surface/specs/repo-inspection-cli/spec.md @@ -0,0 +1,101 @@ +## ADDED Requirements + +### Requirement: Top-level CLI uses git-shaped repository inspection verbs +The public `dml` CLI SHALL expose repository-oriented porcelain commands at the top level: `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, and `revert`. + +#### Scenario: Top-level help reflects git-shaped porcelain +- **WHEN** a user inspects the top-level CLI surface +- **THEN** the documented primary commands are `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, and `revert` + +### Requirement: Status reports repository state instead of config state +`dml status` SHALL report current repository/runtime status as JSON, including the current HEAD state, available local branches, DAG map for the current revision, and live indexes. 
+ +#### Scenario: Status returns repository summary +- **WHEN** a user runs `dml status` +- **THEN** the command returns JSON with `head`, `branches`, `dags`, and `indexes` fields + +### Requirement: Show returns commit metadata, full DAG map, and commit delta +`dml show <revision>` SHALL resolve the revision locally and return JSON with top-level `revision`, `commit`, `dags`, and `change` fields. + +The `dags` field SHALL be the full DAG name-to-ref map for the resolved revision. The `change` field SHALL describe the DAG-map delta introduced by the resolved commit relative to its base commit. + +#### Scenario: Show returns full DAG map and change +- **WHEN** a user runs `dml show HEAD` +- **THEN** the command returns JSON containing `revision`, `commit`, `dags`, and `change` +- **AND** `dags` contains the complete DAG map for the resolved commit + +#### Scenario: Show root commit uses empty base +- **WHEN** a user runs `dml show` on a root commit with no parents +- **THEN** `change.base` is `null` +- **AND** every DAG in `dags` appears as an addition in `change` + +#### Scenario: Show merge commit uses first parent as base +- **WHEN** a user runs `dml show` on a merge commit with multiple parents +- **THEN** `change` is computed relative to the first parent commit + +### Requirement: Diff compares DAG maps between revisions +`dml diff [<left>] [<right>]` SHALL compare two locally resolved revisions and return DAG-map differences as JSON `added`, `removed`, and `updated` sections. + +#### Scenario: Diff returns DAG map changes +- **WHEN** a user runs `dml diff main feature` +- **THEN** the command returns JSON with `left`, `right`, `added`, `removed`, and `updated` fields + +### Requirement: Log returns commit entries for a revision walk +`dml log [<revision>] [--limit N]` SHALL return commit entries starting from the resolved revision, defaulting to `HEAD`. 
+ +#### Scenario: Log defaults to HEAD +- **WHEN** a user runs `dml log` +- **THEN** the command resolves `HEAD` +- **AND** returns JSON containing `revision` and `commits` + +### Requirement: Branch listing supports local and remote-tracking views +`dml branch` SHALL list local branches. `dml branch -r` and `dml branch --remote` SHALL list remote-tracking branches. + +#### Scenario: Branch lists local branches by default +- **WHEN** a user runs `dml branch` +- **THEN** the command returns JSON with a `branches` field containing local branch names + +#### Scenario: Branch lists remote-tracking branches +- **WHEN** a user runs `dml branch --remote` +- **THEN** the command returns JSON with a `branches` field containing remote-tracking branch selectors + +### Requirement: DAG inspection is organized under `dml dag` +The CLI SHALL expose DAG-oriented inspection commands under `dml dag`: `list`, `get`, `checkout`, and `delete`. + +#### Scenario: DAG commands are grouped under dag +- **WHEN** a user inspects DAG-related CLI help +- **THEN** DAG inspection and DAG tree mutation commands appear under `dml dag` + +### Requirement: DAG list returns revision-scoped DAG map +`dml dag list [--revision REV]` SHALL return the DAG name-to-ref map for the selected revision as JSON. + +#### Scenario: DAG list returns mapping +- **WHEN** a user runs `dml dag list --revision HEAD~1` +- **THEN** the command returns JSON with `revision` and `dags` +- **AND** `dags` is an object mapping DAG names to DAG refs + +### Requirement: DAG get resolves by name or exact DAG ref +`dml dag get <selector> [--revision REV]` SHALL resolve either a DAG name within a revision's DAG map or an explicit `dag:<id>` selector. + +If the selector is `dag:<id>`, the command SHALL reject any provided `--revision` flag. 
+ +#### Scenario: DAG get resolves name in revision +- **WHEN** a user runs `dml dag get train --revision HEAD~1` +- **THEN** the command resolves `train` in the DAG map for `HEAD~1` +- **AND** returns JSON containing `selector`, `revision`, and `dag` + +#### Scenario: DAG get loads exact DAG ref +- **WHEN** a user runs `dml dag get dag:abc123` +- **THEN** the command loads that exact DAG object +- **AND** returns JSON containing `selector` and `dag` + +#### Scenario: DAG get rejects revision with explicit DAG ref +- **WHEN** a user runs `dml dag get dag:abc123 --revision HEAD` +- **THEN** the command fails without resolving a revision + +### Requirement: DAG get includes node data +The `dml dag get` payload SHALL include the DAG's node data so that users do not need a separate DAG-node inspection endpoint for normal CLI workflows. + +#### Scenario: DAG get includes nodes +- **WHEN** a user runs `dml dag get train` +- **THEN** the returned `dag` object includes node-level data needed for DAG inspection diff --git a/openspec/changes/archive/2026-05-10-redesign-cli-surface/tasks.md b/openspec/changes/archive/2026-05-10-redesign-cli-surface/tasks.md new file mode 100644 index 0000000..2b09ffa --- /dev/null +++ b/openspec/changes/archive/2026-05-10-redesign-cli-surface/tasks.md @@ -0,0 +1,31 @@ +## 1. Restructure the public CLI surface + +- [x] 1.1 Replace the top-level parser surface with the locked porcelain commands (`status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, `revert`, `dag`, `admin`, `config`). +- [x] 1.2 Remove legacy public command groups and update help text/examples to reflect the new breaking CLI grammar. +- [x] 1.3 Update `dml config` so `config show [--contrib]` becomes the JSON config-status entrypoint. + +## 2. Add repository inspection workflows + +- [x] 2.1 Add domain entrypoints for repository `status`, `show`, `log`, and `diff` so CLI handlers remain thin. 
+- [x] 2.2 Implement `dml show` payload generation with top-level `revision`, `commit`, `dags`, and `change` fields. +- [x] 2.3 Implement DAG-map diff computation for commit-to-base and revision-to-revision comparisons. +- [x] 2.4 Add branch listing support for both local branches and remote-tracking branches used by `dml branch` and `dml branch --remote`. + +## 3. Redesign DAG inspection commands + +- [x] 3.1 Replace current DAG CLI commands with `dag list`, `dag get`, `dag checkout`, and `dag delete`. +- [x] 3.2 Add revision-scoped DAG lookup by name and exact DAG lookup by `dag:<id>`, including rejection of `--revision` with explicit DAG refs. +- [x] 3.3 Expand DAG inspection payloads so `dml dag get` returns the full DAG payload including node data. + +## 4. Implement admin workflows + +- [x] 4.1 Add `dml admin index list|get|delete` and return commit metadata in both list and get responses. +- [x] 4.2 Add `dml admin cache invalidate <cache-key> [more-keys...]` using exact cache-key inputs only. +- [x] 4.3 Add overloaded `dml admin remote list [--owner OWNER]` and `dml admin remote list dml://<owner>/<project>` discovery workflows. +- [x] 4.4 Add `dml admin remote gc` as the unified remote maintenance command and `dml admin gc [--dry-run]` for local GC. + +## 5. Verify contracts and documentation + +- [x] 5.1 Update CLI contract tests to cover the new command grammar and JSON payloads, including admin index commit-info responses. +- [x] 5.2 Update repository/admin docs to match the new CLI surface and command semantics. +- [x] 5.3 Run the relevant CLI and internal contract test suites and resolve any failures. 
diff --git a/openspec/changes/archive/2026-05-10-unify-dml-boundary/design.md b/openspec/changes/archive/2026-05-10-unify-dml-boundary/design.md new file mode 100644 index 0000000..5fb04e3 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-unify-dml-boundary/design.md @@ -0,0 +1,227 @@ +## Context + +The original change was written as a full end-to-end boundary migration, but the branch only landed the first half of that story: + +- `_internal.__init__` now lazy-exports a future shared `Dml` surface plus helper functions. +- `_internal.dml_context` now centralizes config-derived runtime/project helpers. +- Some `_internal` modules have started importing through the shared `_internal` export surface. + +The missing part is the actual boundary itself. `_internal.__init__` references `daggerml._internal.dml` and `daggerml._internal.dml_resolution`, but those modules do not exist yet. That leaves the change marked complete while the shared internal orchestration boundary is still absent. + +This design narrows the change to what the branch is actually ready for: finish the missing `_internal` boundary modules and wiring, keep the blast radius inside `src/daggerml/_internal/`, and defer broader API/CLI/contrib cleanup until a follow-up. + +## Goals / Non-Goals + +**Goals:** +- Make `_internal.Dml` and `_internal.dml_resolution` real and importable. +- Reuse the already-added `_internal.dml_context` helpers rather than moving context logic again. +- Keep the shared `Dml` orchestration boundary delegated to existing ops classes instead of re-implementing repository mechanics. +- Preserve the fixed namespaced method surface expected by current callers. +- Limit the remaining implementation to a few files under `src/daggerml/_internal/`. + +**Non-Goals:** +- Removing `daggerml.api.Dml` or `DmlOps` compatibility surfaces in this change. +- Rewriting CLI handlers or contrib integrations. +- Expanding selector grammar beyond what is already specified elsewhere. 
+- Changing repository storage formats, remote schemas, or commit/tree semantics. + +## Current State Snapshot + +```text +callers/importers + | + v +daggerml._internal.__init__ + | + +--> dml_context.py [present] + +--> ops/* [present] + +--> dml_resolution.py [missing] + +--> dml.py / Dml [missing] +``` + +## Decisions + +### Treat the landed `_internal` work as groundwork, not completion + +The existing `_internal.__init__` export expansion and `_internal.dml_context` module are part of the intended architecture and should be recorded as completed groundwork for this change rather than rolled back or ignored. + +Rationale: +- Those files already establish the shape of the future boundary. +- They are useful once the missing `Dml` and resolution modules exist. +- Reframing them as groundwork makes the remaining plan honest about what is done and what is not. + +Alternatives considered: +- Revert the groundwork and restart the change. Rejected because the landed context/export helpers are aligned with the intended end state. + +### Finish the boundary with two new `_internal` modules and light wiring + +The remaining implementation should add: + +- `src/daggerml/_internal/dml_resolution.py` for revision and DAG-selector helpers. +- `src/daggerml/_internal/dml.py` for the shared context-managed `Dml` facade. +- Minimal adjacent wiring inside `_internal` if import cycles or export cleanup require it. + +`Dml` should delegate config-derived context lookup to `dml_context` and repository actions to existing ops classes. + +Rationale: +- The missing modules are the actual reason the boundary is incomplete. +- Adding them finishes the architecture already implied by `_internal.__init__`. +- Keeping the work inside `_internal` avoids reopening a wide migration while the core boundary is still absent. + +Alternatives considered: +- Expand the remaining work back out to `api`, `_cli`, and `contrib` in the same change. 
Rejected because the missing `_internal` boundary is still the blocking prerequisite. + +### Preserve compatibility wrappers until a follow-up cleanup + +This change does not need to remove `daggerml.api.Dml` or `DmlOps` immediately. The important step now is to make `_internal.Dml` real and canonical so other layers can converge on it without importing missing modules. + +Rationale: +- Current callers and contrib helpers still reference compatibility surfaces. +- Removing them now would expand the scope far beyond the missing `_internal` work. +- Once `_internal.Dml` exists, follow-up cleanup becomes mechanical instead of speculative. + +Alternatives considered: +- Remove all compatibility entrypoints now. Rejected because it would require many extra file edits outside `_internal`. + +### `Dml` orchestrates by delegating to the relevant subsystem + +The new `_internal.Dml` should coordinate workflows by farming repository actions to the relevant lower-level subsystem instead of re-implementing repository mechanics itself. 
+ +Delegation matrix: + +| `Dml` responsibility | Delegated owner | +| --- | --- | +| fuzzy revision and DAG-selector resolution | fuzzy-resolution submodule | +| current head, default branch, remote-uri, and related config-derived context | config submodule | +| `show`, `log`, `diff`, `merge`, `revert`, revision-scoped DAG-map inspection | `CommitOps` | +| branch and HEAD state reads/writes | `HeadOps` | +| exact DAG reads and DAG inspection payload assembly inputs | `DagOps` | +| runtime index creation, staging, execution, and commit finalization | `IndexOps` | +| node materialization and unrolling | `NodeOps` | +| cache invalidation and cache-backed runtime support | `CacheOps` | +| remote discovery, fetch/pull/push support, and remote maintenance | `RemoteOps` | +| local garbage collection | `GcOps` | + +Examples: +- commit-oriented workflows such as `show`, `log`, `diff`, `merge`, `revert`, and revision-scoped DAG-map inspection delegate to `CommitOps` +- head and branch state workflows delegate to `HeadOps` +- DAG inspection and exact DAG reads delegate to `DagOps` +- runtime staging and commit-finalization workflows delegate to `IndexOps` +- node materialization delegates to `NodeOps` +- cache invalidation delegates to `CacheOps` +- remote discovery and maintenance delegates to `RemoteOps` +- local garbage collection delegates to `GcOps` + +Rationale: +- This preserves the existing subsystem ownership boundaries. +- It keeps `Dml` focused on caller-facing workflow composition, not storage mechanics. + +Alternatives considered: +- Re-implement commit, head, or index logic directly in `Dml`. Rejected because it would flatten subsystem boundaries and duplicate repository logic. 
+ +### The public boundary is fixed top-level porcelain plus namespaces + +The shared `_internal.Dml` class should expose: + +- top-level repository methods: `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, `revert` +- domain namespaces: `dag`, `admin`, `runtime`, `config` +- exact-subsystem namespace: `ops` + +`ops` is an intentional low-level escape hatch for exact subsystem objects such as `CommitOps`, `HeadOps`, `DagOps`, `NodeOps`, `IndexOps`, `CacheOps`, `RemoteOps`, `GcOps`, and `ConfigOps`. These objects remain publicly reachable under `dml.ops.*`, but they are not promoted to direct top-level `Dml` attributes. + +Rationale: +- This keeps the main caller-facing model aligned with the redesigned CLI and the intended Python-facing domain surface. +- It still preserves access to exact subsystem contracts for wrappers, tests, and advanced integrations that need them. +- It avoids reintroducing legacy storage-oriented nouns as first-class top-level public entrypoints on `Dml`. + +Alternatives considered: +- Expose raw subsystem factories (`commit`, `head`, `index`, etc.) directly on `Dml`. Rejected because it reintroduces storage-oriented mental models into the primary boundary. +- Hide all exact subsystem objects. Rejected because wrappers and internal integrations still need a sanctioned exact-input escape hatch. + +### The constructor matches the current runtime/context plumbing + +The shared `Dml` constructor should accept the root runtime override inputs already threaded through callers: project-home, remote-uri, user, and config-home context. Construction establishes the repository/runtime context, and methods resolve any additional omitted values through `dml_context` inside the method body. + +Rationale: +- This lets CLI handlers instantiate `Dml` directly from global parsed args. +- It gives API wrappers the same context model instead of inventing a parallel constructor contract. 
+ +Alternatives considered: +- Separate CLI-only and API-only constructors. Rejected because it recreates two runtime entrypoints. +- Push repo/remote context into every method instead of the constructor. Rejected because it would make call sites noisy and duplicate context normalization. + +### The fixed namespaced method table remains the boundary target + +The unified class will expose the already-chosen method table: + +- top level: `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, `revert` +- `dag`: `list`, `get`, `checkout`, `delete` +- `admin.index`: `list`, `get`, `delete` +- `admin.cache`: `invalidate` +- `admin.remote`: `list`, `gc` +- `admin`: `gc` +- `runtime`: `create`, `describe`, `put_literal`, `put_import`, `start_fn`, `commit` +- `config`: `get`, `set`, `show` +- `ops`: `commit`, `head`, `dag`, `node`, `index`, `cache`, `remote`, `gc`, `config` + +Rationale: +- This preserves the porcelain-vs-admin-vs-runtime split already chosen while making config explicit and keeping exact subsystem access under one intentionally low-level namespace. +- It keeps the CLI and Python-facing boundary aligned around one vocabulary. + +Alternatives considered: +- Put config behavior on top-level `Dml` methods. Rejected because config is a distinct concern and already reads naturally as a namespace. + +### Return values are JSON-ready payloads with typed leaves allowed + +`Dml` methods will return plain dict/list containers holding bool/int/str/None scalars, but leaf values may still include `Ref`, `Uri`, `Error`, and `Runnable` objects for shared encoding and wrapper use. + +Rationale: +- This keeps CLI handlers thin: parse, call `Dml`, JSON-encode. +- It avoids premature stringification in the domain layer while keeping result shapes serialization-friendly. + +Alternatives considered: +- Return rich result objects for porcelain methods. Rejected because it would force CLI-specific reshaping logic back into callers.
+- Stringify all typed leaves inside `Dml`. Rejected because the codebase already has stable encoders for several typed leaf objects. + +### Init and recovery live on the shared internal `Dml` + +Repository bootstrap and recovery workflows should be exposed on the shared `_internal.Dml` class while preserving the existing config-first recovery behavior and using `dml_context` plus the relevant ops classes to perform the work. + +Rationale: +- Removing `DmlOps` in the follow-up cleanup will require a new owner for bootstrap workflows. +- Init is part of the caller-facing repository boundary and fits the new role of `Dml`. + +Alternatives considered: +- Leave a minimal `DmlOps` only for init. Rejected because it would preserve a second orchestration entrypoint after the rest of the class is removed. + +### Implementation proceeds from shell to namespaces to porcelain + +The implementation should be staged in dependency order rather than by broad feature bucket. + +Recommended order: + +1. establish the `Dml` shell with only `_context` and `_tempdirs`, plus context-manager lifecycle and private helper stubs +2. add the `ops` namespace so exact subsystem objects are available under one sanctioned low-level entrypoint +3. add the `config` namespace because it is thin, mostly delegated, and validates the namespace pattern early +4. add the `runtime` namespace because active DAG runtime workflows are central to wrapper compatibility and depend mostly on `IndexOps` +5. add `dml_resolution.py` so revision and DAG-selector behavior is centralized before higher-level caller-facing namespaces rely on it +6. add the `dag` namespace on top of resolution plus existing DAG/node ops +7. add the `admin` namespace after the underlying subsystem entrypoints already exist +8. add the top-level porcelain workflows last, reusing the namespaces and resolution helpers rather than inventing parallel paths +9.
add bootstrap/recovery flows (`create`, `temporary`, `init`) once the surrounding config/ops plumbing is already in place + +Rationale: +- This reduces circular design pressure while the shared boundary is still being formed. +- It validates the public namespace model before layering full repository porcelain on top. +- It keeps `Dml` itself small by forcing most behavior into namespaces or delegated helpers first. + +Alternatives considered: +- Implement top-level porcelain first. Rejected because it encourages one-off helper logic before the namespace model is stable. +- Start with init/recovery first. Rejected because bootstrap touches too many surrounding concerns to be the clean first landing point. + +## Risks / Trade-offs + +- [The branch already imports missing modules] → Finish the missing modules first and keep the remaining patch small. +- [Temporary duplicate entrypoints] → Keep `api.Dml` and `DmlOps` as compatibility surfaces until a follow-up cleanup change. +- [Overgrown `Dml`] → Keep namespace boundaries explicit and keep transactional storage logic in ops classes. +- [Import-cycle risk from `_internal.__init__`] → Limit new wiring to the minimum needed for the shared boundary to import cleanly. diff --git a/openspec/changes/archive/2026-05-10-unify-dml-boundary/proposal.md b/openspec/changes/archive/2026-05-10-unify-dml-boundary/proposal.md new file mode 100644 index 0000000..21a9b18 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-unify-dml-boundary/proposal.md @@ -0,0 +1,28 @@ +## Why + +The branch already contains part of the unification work under `src/daggerml/_internal/`: `_internal.__init__` now expects a shared `Dml` export, `_internal.dml_context` centralizes config-derived runtime/project helpers, and some internal modules have started importing through the `_internal` export surface. 
+ +The change drifted because the core `_internal.dml.Dml` facade and `_internal.dml_resolution` helpers were never added, while the change artifacts were still marked complete. We need to realign the change with what is actually present and finish the missing boundary work with a small `_internal`-only patch instead of pretending the full API/CLI/contrib migration already landed. + +## What Changes + +- Complete the missing shared `_internal.Dml` boundary and `_internal.dml_resolution` module that the current `_internal` export surface already references. +- Keep `_internal.dml_context` as the config/context owner and make the shared `Dml` delegate through it plus the existing ops classes. +- Finish the remaining work in a few files under `src/daggerml/_internal/` instead of expanding the blast radius across `api`, `cli`, and `contrib` in this change. +- Preserve existing compatibility surfaces such as `daggerml.api.Dml` and `DmlOps` for now; broader removal/cleanup can happen in a follow-up once the shared internal boundary is real. + +## Capabilities + +### New Capabilities +- `unified-dml-surface`: One caller-facing `Dml` contract shared by API wrappers and CLI handlers, including the fixed top-level methods plus `dag`, `admin`, `runtime`, `config`, and `ops` namespaces. + +### Modified Capabilities +- `git-like-commit-ops`: Finish the shared `Dml` entrypoint that owns project-workflow orchestration while reusing the already-landed ops/context groundwork. +- `dmlops-init-recovery`: Preserve init/recovery behavior by putting the bootstrap entrypoint on the new shared internal `Dml` class. + +## Impact + +- Affects only a small `_internal` slice: `src/daggerml/_internal/__init__.py`, `src/daggerml/_internal/dml_context.py`, the new `src/daggerml/_internal/dml.py`, the new `src/daggerml/_internal/dml_resolution.py`, and any minimal adjacent wiring needed inside `src/daggerml/_internal/`. 
+- Changes the internal layering contract by making the shared `_internal.Dml` boundary real instead of just referenced by exports and callers. +- Leaves API, CLI, contrib, and broad compatibility cleanup to follow-up work once the internal boundary exists and is importable. +- Keeps on-disk repository formats, revision grammar, and existing caller contracts unchanged in this phase. diff --git a/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/dmlops-init-recovery/spec.md b/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/dmlops-init-recovery/spec.md new file mode 100644 index 0000000..4feafcc --- /dev/null +++ b/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/dmlops-init-recovery/spec.md @@ -0,0 +1,19 @@ +## MODIFIED Requirements + +### Requirement: Init recovers missing DB when project config already exists +The system SHALL treat `.dml/config.toml` + missing `.dml/db/` as a recoverable initialization state through the shared internal `Dml` bootstrap workflow. + +#### Scenario: Existing config with missing DB is recovered +- **WHEN** the `Dml` init/bootstrap workflow runs in a project where `.dml/config.toml` exists and `.dml/db/` does not +- **THEN** initialization uses `dml_context` to resolve bootstrap context, creates `.dml/db/`, and completes without requiring manual repository repair + +### Requirement: Recovery mode pulls when a project URI is configured +The system SHALL perform project bootstrap pull during recovery when resolved configuration includes `remote.project`. 
+ +#### Scenario: Recovery triggers pull when project URI is present +- **WHEN** the `Dml` init/bootstrap workflow recovers a missing DB and resolved config includes `remote.project` +- **THEN** it uses `dml_context` to obtain the resolved project and remote configuration and runs pull through the relevant ops-backed workflow to populate local repository state + +#### Scenario: Recovery skips pull when project URI is absent +- **WHEN** the `Dml` init/bootstrap workflow recovers a missing DB and resolved config has no `remote.project` +- **THEN** it creates local DB state without invoking pull diff --git a/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/git-like-commit-ops/spec.md b/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..9928e44 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/git-like-commit-ops/spec.md @@ -0,0 +1,28 @@ +## MODIFIED Requirements + +### Requirement: Git-like project workflows are owned by `Dml` orchestration +Git-like project command workflows SHALL be available through the shared internal `Dml` orchestration boundary, which coordinates commit, head, and remote operations while delegating concrete repository actions to lower-level ops classes. 
+ +#### Scenario: Pull executes through Dml workflow +- **WHEN** a caller invokes project pull with remote target, branch target, and user context +- **THEN** `Dml` obtains project and remote context through `dml_context`, resolves any fuzzy selectors through its fuzzy-resolution submodule, performs remote synchronization, and applies merge behavior through internal ops + +#### Scenario: Push executes through Dml workflow +- **WHEN** a caller invokes project push with remote target and push options +- **THEN** `Dml` obtains project and remote context through `dml_context`, performs project-aware remote push behavior through the relevant ops classes, and returns the push result through the shared boundary + +#### Scenario: Revert executes through Dml workflow +- **WHEN** a caller invokes project revert with revision, branch target, and user context +- **THEN** `Dml` resolves the revision through its fuzzy-resolution submodule and performs revert behavior through `CommitOps` + +#### Scenario: Checkout executes through Dml workflow +- **WHEN** a caller invokes repository checkout with a revision value +- **THEN** `Dml` resolves the revision through its fuzzy-resolution submodule and performs attached-vs-detached checkout behavior through the relevant ops classes + +#### Scenario: Init runs through Dml-owned project setup +- **WHEN** a caller invokes repository init/bootstrap behavior +- **THEN** `Dml` initializes project state under `.dml/` in the current location through the shared internal boundary instead of requiring a separate bootstrap entrypoint + +#### Scenario: Init recovers config-first partial state +- **WHEN** `.dml/config.toml` exists but `.dml/db/` is missing at init time +- **THEN** the Dml-owned init workflow uses `dml_context` to resolve bootstrap context, creates the missing DB state, and continues bootstrap behavior through the relevant ops classes diff --git a/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/unified-dml-surface/spec.md 
b/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..4bfe516 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-unify-dml-boundary/specs/unified-dml-surface/spec.md @@ -0,0 +1,113 @@ +## ADDED Requirements + +### Requirement: One shared `_internal.Dml` class is the canonical orchestration boundary +The system SHALL expose one shared `_internal.Dml` class for repository, DAG, admin, and runtime workflows. + +#### Scenario: CLI delegates through shared Dml +- **WHEN** a CLI command executes a repository, DAG, admin, or runtime workflow +- **THEN** the handler instantiates or receives a `Dml` instance and delegates through that class instead of orchestrating lower-level ops classes directly + +#### Scenario: API wrappers delegate through shared Dml +- **WHEN** `Dag` or `Node` wrappers need repository/runtime behavior +- **THEN** they delegate through the shared internal `Dml` implementation, whether by direct use or by a thin compatibility wrapper in `daggerml.api` + +### Requirement: `Dml` delegates fuzzy and config resolution to dedicated submodules +The shared `Dml` class SHALL remain the sole caller-facing boundary for fuzzy selector and config-derived context behavior, but it SHALL delegate fuzzy selector resolution to a dedicated fuzzy-resolution submodule and config-derived context lookup to a dedicated config submodule.
+ +#### Scenario: Revision parsing delegates to fuzzy-resolution submodule +- **WHEN** a caller passes a supported revision string to a `Dml` repository method +- **THEN** `Dml` delegates the fuzzy parsing and resolution step to the fuzzy-resolution submodule before invoking lower-level ops + +#### Scenario: Current head and remote context delegate to config submodule +- **WHEN** a `Dml` workflow needs current head state, default branch behavior, or remote-uri context +- **THEN** `Dml` obtains that config-derived context through the config submodule before invoking lower-level ops + +### Requirement: Shared `Dml` constructor uses root runtime override inputs +The shared `Dml` constructor SHALL accept the root runtime override inputs already threaded through callers for project-home, remote-uri, user, and config-home context. + +#### Scenario: CLI globals map directly to constructor +- **WHEN** a caller provides explicit project-home, remote-uri, user, or config-home runtime overrides +- **THEN** those values can be passed directly to the shared `Dml` constructor without a separate caller-specific context adapter + +### Requirement: Shared `Dml` exposes the fixed method namespaces +The shared `Dml` class SHALL expose this caller-facing method surface: + +- top level: `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, `revert` +- `dag`: `list`, `get`, `checkout`, `delete` +- `admin.index`: `list`, `get`, `delete` +- `admin.cache`: `invalidate` +- `admin.remote`: `list`, `gc` +- `admin`: `gc` +- `runtime`: `create`, `describe`, `put_literal`, `put_import`, `start_fn`, `commit` +- `config`: `get`, `set`, `show` +- `ops`: `commit`, `head`, `dag`, `node`, `index`, `cache`, `remote`, `gc`, `config` + +#### Scenario: Top-level repository methods are present +- **WHEN** a caller inspects the shared `Dml` class +- **THEN** the repository porcelain workflows are available on the top level rather than through raw subsystem factories + 
+#### Scenario: DAG, admin, runtime, and config methods remain namespaced +- **WHEN** a caller needs DAG inspection, admin maintenance, runtime staging behavior, or config access +- **THEN** the shared `Dml` exposes those methods under `dag`, `admin`, `runtime`, and `config` namespaces respectively + +#### Scenario: Exact subsystem objects are grouped under ops +- **WHEN** a caller needs direct exact-input subsystem behavior such as `CommitOps`, `HeadOps`, or `IndexOps` +- **THEN** the shared `Dml` exposes those objects under `dml.ops.*` rather than as direct top-level `Dml` attributes + +### Requirement: `Dml` stores only runtime context and temporary-directory bookkeeping +The shared `Dml` class SHALL keep only `_context` and `_tempdirs` as private instance attributes. + +#### Scenario: Namespace and helper access do not require extra Dml instance fields +- **WHEN** a caller uses any public namespace on `Dml` +- **THEN** the namespace behavior is derived from `_context`, `_tempdirs`, and delegated helper logic without introducing additional private `Dml` instance attributes + +### Requirement: `Dml` is the only fuzzy-selector boundary +The shared `Dml` class SHALL accept only the fuzzy selector forms already specified by the redesigned CLI contracts and SHALL resolve those forms internally before invoking lower-level operations. 
+ +#### Scenario: Revision selector resolves inside Dml +- **WHEN** a caller passes a supported revision string such as `HEAD~1` to a shared `Dml` repository method +- **THEN** the `Dml` method resolves it through the fuzzy-resolution submodule and lower-level ops receive only exact values + +#### Scenario: DAG selector resolves inside Dml +- **WHEN** a caller passes `train` or `dag:abc123` to `dml.dag.get` +- **THEN** the shared `Dml` method performs the selector-mode handling through the fuzzy-resolution submodule and lower-level ops do not parse that caller-facing form + +#### Scenario: Unsupported fuzzy grammar is rejected at Dml boundary +- **WHEN** a caller passes a selector form that is not documented by the redesigned CLI contracts +- **THEN** the shared `Dml` method fails rather than inventing additional grammar + +### Requirement: Lower-level ops classes accept resolved values only +Lower-level ops classes used by `Dml` SHALL accept exact refs, exact branch names, exact ids, and other resolved repository values rather than caller-facing fuzzy selectors or config-shaped overrides. + +#### Scenario: Commit workflow uses exact values below Dml +- **WHEN** a shared `Dml` method invokes commit/head workflow behavior +- **THEN** the lower-level ops calls receive already-resolved commits, branches, or ids instead of revision grammar strings + +### Requirement: `Dml` delegates repository behavior to the relevant ops classes +The shared `Dml` class SHALL orchestrate workflows by delegating repository actions to the relevant subsystem ops classes rather than re-implementing those mechanics inline. 
+ +#### Scenario: Commit-oriented workflow delegates to CommitOps +- **WHEN** a caller invokes `dml.show`, `dml.log`, `dml.diff`, `dml.merge`, or `dml.revert` +- **THEN** `Dml` delegates the relevant repository operations to `CommitOps` after preparing resolved inputs + +#### Scenario: Runtime workflow delegates to IndexOps +- **WHEN** a caller invokes `dml.runtime.create`, `dml.runtime.put_literal`, `dml.runtime.start_fn`, or `dml.runtime.commit` +- **THEN** `Dml` delegates the relevant repository operations to `IndexOps` after preparing resolved inputs + +#### Scenario: Admin workflow delegates to the owning subsystem +- **WHEN** a caller invokes an admin cache, remote, or gc workflow +- **THEN** `Dml` delegates the repository action to `CacheOps`, `RemoteOps`, or `GcOps` respectively after preparing resolved inputs + +### Requirement: Shared `Dml` returns JSON-ready payloads +Shared `Dml` methods SHALL return JSON-ready dict/list payloads for container structure, while allowing typed leaves such as `Ref`, `Uri`, `Error`, and `Runnable`. + +#### Scenario: CLI-ready result shape comes from Dml +- **WHEN** a caller invokes a shared `Dml` repository or admin workflow +- **THEN** the returned payload is ready for JSON serialization without CLI-owned result reshaping beyond standard typed-leaf encoding + +### Requirement: Repository bootstrap and recovery are available through shared `Dml` +Repository bootstrap and recovery workflows SHALL be available through the shared `Dml` boundary. 
+ +#### Scenario: Init and recovery use Dml-owned entrypoint +- **WHEN** a caller invokes repository bootstrap or recovery behavior +- **THEN** the workflow executes through a `Dml` entrypoint and preserves the documented config-first recovery semantics diff --git a/openspec/changes/archive/2026-05-10-unify-dml-boundary/tasks.md b/openspec/changes/archive/2026-05-10-unify-dml-boundary/tasks.md new file mode 100644 index 0000000..ba54c93 --- /dev/null +++ b/openspec/changes/archive/2026-05-10-unify-dml-boundary/tasks.md @@ -0,0 +1,38 @@ +## 1. Landed groundwork + +- [x] 1.1 Expand `src/daggerml/_internal/__init__.py` so the planned shared boundary, helper functions, and ops exports can be reached from one `_internal` surface. +- [x] 1.2 Add `src/daggerml/_internal/dml_context.py` to centralize resolved runtime/project context helpers such as config lookup, branch/default selection, project home checks, and recovery helpers. +- [x] 1.3 Start routing selected `_internal` modules through the shared export surface so the future boundary can depend on one import layer. + +## 2. Build the shared `Dml` shell + +- [ ] 2.1 Add `src/daggerml/_internal/dml.py` with the shared `Dml` shell, storing only `_context` and `_tempdirs` plus context-manager lifecycle methods. +- [ ] 2.2 Add private helper stubs on `Dml` for delegated ops access, selector resolution, runtime branch lookup, and S3 client creation without exposing extra top-level public attributes. + +## 3. Add namespace scaffolding in dependency order + +- [ ] 3.1 Add the public `ops` namespace exposing exact subsystem objects under `dml.ops.commit`, `head`, `dag`, `node`, `index`, `cache`, `remote`, `gc`, and `config`. +- [ ] 3.2 Add the public `config` namespace with `get`, `set`, and `show`. +- [ ] 3.3 Add the public `runtime` namespace with `create`, `describe`, `put_literal`, `put_import`, `start_fn`, and `commit`. 
+ +- [ ] 3.4 Add `src/daggerml/_internal/dml_resolution.py` with the revision and DAG-selector helpers already referenced by `_internal.__init__.py`. +- [ ] 3.5 Add the public `dag` namespace with `list`, `get`, `checkout`, and `delete` on top of `dml_resolution` plus delegated ops. +- [ ] 3.6 Add the public `admin` namespace with `index.list|get|delete`, `cache.invalidate`, `remote.list|gc`, and `gc`. + +## 4. Add top-level porcelain workflows + +- [ ] 4.1 Implement read-oriented porcelain first: `status`, `log`, `show`, `diff`, and `branch`. +- [ ] 4.2 Implement mutating/sync porcelain next: `checkout`, `fetch`, `pull`, `push`, `merge`, and `revert`. + +## 5. Add bootstrap and recovery flows + +- [ ] 5.1 Add `Dml.create` and `Dml.temporary` on top of the shared shell and delegated subsystem helpers. +- [ ] 5.2 Put `Dml.init` on the new shared class while preserving the config-first recovery semantics already captured in `dml_context`. + +## 6. Finalize `_internal` wiring and verify the narrowed scope + +- [ ] 6.1 Wire `_internal.__init__.py` to export only implemented modules and remove the current broken references to missing `_internal.dml` and `_internal.dml_resolution` modules. +- [ ] 6.2 Keep `DmlOps` and other compatibility surfaces untouched unless a minimal `_internal` wiring change is required to make the shared boundary import cleanly. +- [ ] 6.3 Run focused import and contract coverage for `daggerml._internal.Dml`, `Dml.init`, and the delegated namespace surface. +- [ ] 6.4 Verify the corrected public boundary shape: `Dml` stores only `_context` and `_tempdirs`, and exposes exact subsystem objects only under `dml.ops.*`. +- [ ] 6.5 Confirm the remaining implementation only touches a few files under `src/daggerml/_internal/`; if more churn is required, capture that as a follow-up change instead of expanding this one. 
diff --git a/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/.openspec.yaml b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/.openspec.yaml new file mode 100644 index 0000000..81cd71f --- /dev/null +++ b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-11 diff --git a/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/design.md b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/design.md new file mode 100644 index 0000000..0333200 --- /dev/null +++ b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/design.md @@ -0,0 +1,60 @@ +## Context + +Codec logic is currently split across three places: registry and traversal in `daggerml._internal.codec`, `NodeCodec` in `daggerml.api`, and delayed-action codec behavior in `daggerml.contrib.api`. This makes `_internal` responsible for behavior that depends on public wrapper types and contrib-facing delayed values. + +The change is intentionally staged. Stage 1 is a relocation only: `src/daggerml/codecs.py` becomes the single home for codec code, while `_internal` continues to call codecs exactly as it does today through `CodecContext`. Stage 2 then changes ownership and contract: codecs receive `Dag`, recursive traversal moves to `daggerml.api.Dag`, and `_internal` stops normalizing values. + +## Goals / Non-Goals + +**Goals:** +- Establish `src/daggerml/codecs.py` as the only module that contains codec logic and codec types. +- Preserve current behavior during Stage 1, including plugin loading, `Node` handling, delayed-action handling, and `_internal` call sites. +- In Stage 2, move recursive normalization and insertion ownership to `daggerml.api.Dag`. +- Preserve the `daggerml.codecs` plugin entry-point group across both stages. + +**Non-Goals:** +- Redesign codec matching, priority, or convergence semantics. 
+- Remove `Node` as a codec. +- Introduce a second plugin system or new codec registration surface. +- Change adapter execution, DAG storage format, or non-codec staging semantics. + +## Decisions + +### Create one codec module at `src/daggerml/codecs.py` +All codec code moves into `daggerml.codecs`: registry, plugin loading, codec protocol, built-in codecs, delayed-action value types, and traversal helpers. This gives both stages a single implementation target and removes split ownership across public, contrib, and internal modules. + +Alternative considered: keep built-in codecs in `api.py` and `contrib/api.py` while only moving the registry. Rejected because it would preserve the same ownership split that this change is trying to remove. + +### Use a codec-local error type in Stage 1 +Stage 1 avoids importing `_internal.types` from `daggerml.codecs`. Codec failures therefore raise a codec-local exception type, and `_internal` translates that exception back into repository-domain errors at its boundary. This keeps Stage 1 as a pure extraction while preserving outward behavior. + +Alternative considered: keep `daggerml.codecs` dependent on `_internal.types.DmlRepoError`. Rejected because it would leave the new module coupled to `_internal`, making Stage 2 harder. + +### Keep `CodecContext` only for Stage 1 +Stage 1 keeps the existing contract so `_internal` call sites do not change behavior. Stage 2 removes `CodecContext` entirely and passes `Dag` into codecs. This separates extraction from behavior change and reduces migration risk. + +Alternative considered: switch directly to `Dag` during extraction. Rejected because it would mix module relocation with behavioral changes in call-site ownership. + +### Make `Dag` own recursive normalization in Stage 2 +In Stage 2, one `Dag`-owned helper recursively walks values, applies codecs, preserves or imports nodes, and prepares values for runtime staging. 
`Dag.put` and `Dag.call` use this helper before delegating to runtime methods. `Dag.call` inserts the callable and all arguments before continuing with execution. + +Alternative considered: keep recursion in `_internal` and only change the codec argument from `CodecContext` to `Dag`. Rejected because traversal ownership is the core layering problem. + +## Risks / Trade-offs + +- [Import-cycle risk during Stage 1] -> Move delayed-action types and built-in codecs into `daggerml.codecs` together rather than splitting them across modules. +- [Behavior drift between Stage 1 and Stage 2] -> Treat Stage 1 as a no-semantics-change extraction and cover current codec behavior with tests before changing ownership. +- [Error-surface mismatch after introducing codec-local errors] -> Translate codec-local failures to repository-domain failures at `_internal` call sites until Stage 2 removes those boundaries. +- [Plugin breakage when the encode contract changes to `Dag`] -> Keep the entry-point group stable, document the new argument contract clearly, and migrate built-in codecs first. + +## Migration Plan + +1. Stage 1: create `src/daggerml/codecs.py`, move all codec logic there, and update `_internal` to import codec symbols from that module while preserving `CodecContext` call sites. +2. Stage 1: introduce codec-local errors and translate them to repository-domain errors at `_internal` boundaries. +3. Stage 2: move recursive normalization into `daggerml.api.Dag` and change codec `encode(...)` to receive `Dag`. +4. Stage 2: remove `_internal` codec traversal, remove `CodecContext`, and update plugin and built-in codec implementations to the `Dag` contract. + +## Open Questions + +- Whether delayed-action helper types continue to be re-exported from `daggerml.contrib.api` after moving their implementation into `daggerml.codecs`. +- Whether Stage 2 should keep a separate public helper for codec-driven insertion or keep it fully internal to `Dag` methods. 
diff --git a/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/proposal.md b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/proposal.md new file mode 100644 index 0000000..33968db --- /dev/null +++ b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/proposal.md @@ -0,0 +1,24 @@ +## Why + +Codec behavior is currently split across `daggerml._internal`, `daggerml.api`, and `daggerml.contrib.api`, which makes the ownership boundary unclear and couples internal staging code to public wrapper concerns. We want a staged migration that first centralizes codec code in one public module, then moves the codec contract and traversal ownership to `Dag` without changing plugin extensibility. + +## What Changes + +- Add a single codec module at `src/daggerml/codecs.py` that owns codec registration, plugin loading, codec types, and built-in codec implementations. +- Move all existing codec logic out of `daggerml._internal.*`, `daggerml.api`, and `daggerml.contrib.api` into `daggerml.codecs`. +- Stage 1: preserve the current runtime contract by continuing to call codecs from `_internal` with `CodecContext`, while translating codec-local errors back to repository-domain errors at the `_internal` boundary. +- Stage 2: change the codec contract so codecs receive `daggerml.api:Dag`, move recursive codec traversal and insertion ownership into `Dag` methods, and remove `CodecContext` entirely. +- Keep `Node` as a built-in codec, keep plugin discovery under the `daggerml.codecs` entry-point group, and update `Dag.call` to insert callable and argument values before invoking runtime execution. + +## Capabilities + +### New Capabilities +- `codec-normalization`: Defines the codec module boundary, built-in codec behavior, plugin contract, and staged migration of codec traversal from `_internal` to `daggerml.api.Dag`. 
+ +### Modified Capabilities + +## Impact + +- Affected code: `src/daggerml/codecs.py`, `src/daggerml/api.py`, `src/daggerml/contrib/api.py`, `src/daggerml/_internal/__init__.py`, `src/daggerml/_internal/ops/index.py`, and codec-related tests. +- Affected APIs: codec plugin `encode(...)` contract, internal codec error translation, and `Dag`-owned staging/insert behavior. +- Affected packaging: the `daggerml.codecs` plugin entry-point group remains in place but now targets the unified codec module. diff --git a/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/specs/codec-normalization/spec.md b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/specs/codec-normalization/spec.md new file mode 100644 index 0000000..def0d9e --- /dev/null +++ b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/specs/codec-normalization/spec.md @@ -0,0 +1,65 @@ +## ADDED Requirements + +### Requirement: Codec logic has a single owning module +The system SHALL define `src/daggerml/codecs.py` as the only module that contains codec logic, codec types, codec registry behavior, plugin loading behavior, and built-in codec implementations. + +#### Scenario: Internal callers import codec behavior from the unified module +- **WHEN** internal staging code needs codec registration or codec application behavior +- **THEN** it imports that behavior from `daggerml.codecs` +- **AND** `daggerml._internal.*` does not define codec logic of its own + +#### Scenario: Built-in codecs live in the unified module +- **WHEN** the system provides built-in codec behavior for `Node` values or delayed-action values +- **THEN** those codec implementations are defined in `daggerml.codecs` + +### Requirement: Stage 1 preserves current codec call semantics +During Stage 1, the system SHALL continue to invoke codec behavior from internal staging call sites using `CodecContext`, while sourcing that behavior from `daggerml.codecs`. 
+ +#### Scenario: Literal staging still applies codecs through internal call sites +- **WHEN** `_internal` literal staging normalizes a value during Stage 1 +- **THEN** it applies codecs through `daggerml.codecs` +- **AND** it passes `CodecContext` to codec `encode(...)` + +#### Scenario: Function staging still applies codecs through internal call sites +- **WHEN** `_internal` function staging normalizes argv or kwargv values during Stage 1 +- **THEN** it applies codecs through `daggerml.codecs` +- **AND** it passes `CodecContext` to codec `encode(...)` + +#### Scenario: Codec-local failures are translated at the internal boundary +- **WHEN** codec application fails during Stage 1 +- **THEN** `daggerml.codecs` raises a codec-local error type +- **AND** the `_internal` caller translates that failure into the repository-domain error surface it already exposes + +### Requirement: Stage 2 codecs receive Dag instances +During Stage 2, the codec plugin contract SHALL pass `daggerml.api.Dag` into codec `encode(...)` instead of `CodecContext`. + +#### Scenario: Built-in codec receives Dag +- **WHEN** a built-in codec encodes a value during Stage 2 +- **THEN** its `encode(...)` method receives the active `Dag` instance + +#### Scenario: Plugin codec receives Dag +- **WHEN** a plugin codec loaded from the `daggerml.codecs` entry-point group encodes a value during Stage 2 +- **THEN** its `encode(...)` method receives the active `Dag` instance + +### Requirement: Dag owns recursive codec normalization in Stage 2 +During Stage 2, `daggerml.api.Dag` SHALL own recursive codec normalization and insertion for values accepted by public staging and call-entry methods. 
+ +#### Scenario: Dag.put normalizes recursively before runtime staging +- **WHEN** `Dag.put(value)` is called during Stage 2 +- **THEN** `Dag` recursively applies codecs and normalizes nested values before delegating to runtime literal staging + +#### Scenario: Dag.call inserts callable and arguments before execution +- **WHEN** `Dag.call(fn, *args, **kwargs)` is called during Stage 2 +- **THEN** `Dag` inserts the callable, positional arguments, and keyword argument values through the codec-driven normalization path before invoking runtime function staging + +#### Scenario: Node remains a codec during Dag-owned normalization +- **WHEN** a `Node` value is encountered during Stage 2 normalization +- **THEN** the system handles it through the built-in `Node` codec rather than through a special non-codec rule + +### Requirement: Codec plugins remain discoverable through the existing entry-point group +The system SHALL continue to load codec plugins from the `daggerml.codecs` entry-point group across both migration stages. + +#### Scenario: Entry-point group remains stable +- **WHEN** codec plugins are discovered after this change +- **THEN** discovery uses the `daggerml.codecs` entry-point group +- **AND** plugin loading preserves deterministic ordering and re-encode behavior diff --git a/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/tasks.md b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/tasks.md new file mode 100644 index 0000000..84e2849 --- /dev/null +++ b/openspec/changes/archive/2026-05-12-migrate-codecs-to-daggerml-codecs/tasks.md @@ -0,0 +1,19 @@ +## 1. Stage 1 Extraction + +- [x] 1.1 Create `src/daggerml/codecs.py` and move codec registry, plugin loading, `CodecContext`, and built-in codec implementations into it. +- [x] 1.2 Move delayed-action codec types and behavior into `daggerml.codecs`, and update `daggerml.api`, `daggerml.contrib.api`, and `_internal` imports to consume codec symbols from that module. 
+- [x] 1.3 Introduce a codec-local error type in `daggerml.codecs` and translate codec failures back to repository-domain errors at `_internal` call sites. +- [x] 1.4 Update codec-related tests to confirm Stage 1 preserves current behavior while removing codec logic from `daggerml._internal.*`. + +## 2. Stage 2 Contract Migration + +- [x] 2.1 Change the codec contract so built-in codecs and plugin codecs receive `daggerml.api.Dag` instead of `CodecContext`. +- [x] 2.2 Implement a `Dag`-owned recursive codec normalization and insertion helper, and use it from `Dag.put`. +- [x] 2.3 Update `Dag.call` to insert the callable, positional arguments, and keyword argument values through the codec-driven normalization path before runtime execution. +- [x] 2.4 Remove codec traversal and `CodecContext` usage from `_internal` runtime staging paths once `Dag` owns normalization. + +## 3. Validation And Cleanup + +- [x] 3.1 Update codec and API documentation to reflect the unified codec module and the Stage 2 `Dag` contract. +- [x] 3.2 Verify plugin discovery still works through the `daggerml.codecs` entry-point group after both stages. +- [x] 3.3 Run the relevant codec, API, and contrib test coverage for both stages and fix any regressions. diff --git a/openspec/changes/archive/2026-05-13-remove-casts-to-any/.openspec.yaml b/openspec/changes/archive/2026-05-13-remove-casts-to-any/.openspec.yaml new file mode 100644 index 0000000..93831bd --- /dev/null +++ b/openspec/changes/archive/2026-05-13-remove-casts-to-any/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-13 diff --git a/openspec/changes/archive/2026-05-13-remove-casts-to-any/design.md b/openspec/changes/archive/2026-05-13-remove-casts-to-any/design.md new file mode 100644 index 0000000..706eade --- /dev/null +++ b/openspec/changes/archive/2026-05-13-remove-casts-to-any/design.md @@ -0,0 +1,40 @@ +## Context + +The current `cast(..., Any)` sites cluster in two places. 
In `daggerml.contrib.api`, they wrap dynamic class metadata writes, `__init__` wrapping, and dagclass member staging/calling. In the execution runtime, they erase the concrete execution-status type when building or merging `ExecutionRecord` dictionaries. Tests mirror the same pattern by forcing values through `Any` even when the cast changes nothing. + +This change should stay as small as possible: remove the no-op `cast(..., Any)` calls, keep the surrounding behavior intact, and only make additional local edits if removing a cast exposes a real issue. + +## Goals / Non-Goals + +**Goals:** +- Remove every current `cast(..., Any)` occurrence from source and tests. +- Preserve current runtime behavior for `api.dagclass`, `api.run`, `api.funkify`, and execution-record persistence. +- Keep the implementation small and local to the affected modules. + +**Non-Goals:** +- Redesign `api.dagclass`, `api.run`, or adapter execution semantics. +- Broaden the change into a full repo-wide typing cleanup beyond the current `cast(..., Any)` sites. +- Introduce helper abstractions, compatibility layers, or alternate code paths just to compensate for removing `cast(..., Any)`. + +## Decisions + +### Remove the `Any` casts directly +The implementation should delete each `cast(..., Any)` call and keep the surrounding expression as-is whenever that remains valid. The cast is a no-op at runtime and only affects static type checking, so the default approach is simple removal rather than replacement. + +Alternative considered: replace removed casts with helper typing layers. Rejected because that adds scope without serving the stated goal. + +### Only make local follow-up edits when deletion alone is insufficient +If deleting a cast causes a concrete type-check or test failure, the implementation should fix that exact line in the smallest possible way. The change should not expand into broader typing refactors. + +Alternative considered: widen signatures or add new helper APIs.
Rejected because it spreads or grows the change unnecessarily. + +### Keep test cleanup equally direct +Tests should stop using `cast(..., Any)` and instead pass the concrete value directly unless a specific test requires a different minimal local adjustment. + +Alternative considered: leave test-only `Any` casts in place. Rejected because the cleanup should apply everywhere. + +## Risks / Trade-offs + +- [Some sites may not type-check after raw cast removal] -> Make the smallest possible local fix only where needed. +- [Dynamic class mutation is awkward under static typing] -> Do not preemptively abstract it; only touch the exact lines that break. +- [No runtime behavior change means regressions could be subtle] -> Verify with the focused contrib and execution-state test coverage that already exercises these paths. diff --git a/openspec/changes/archive/2026-05-13-remove-casts-to-any/proposal.md b/openspec/changes/archive/2026-05-13-remove-casts-to-any/proposal.md new file mode 100644 index 0000000..5e6d36a --- /dev/null +++ b/openspec/changes/archive/2026-05-13-remove-casts-to-any/proposal.md @@ -0,0 +1,23 @@ +## Why + +The codebase currently relies on `cast(Any, ...)` in a handful of runtime and test paths to silence the type checker instead of expressing the real types. That makes the type surface harder to trust, hides legitimate typing mistakes, and has already spread into core contrib and execution-state code. + +## What Changes + +- Remove all current `cast(..., Any)` usages from runtime and test code. +- Leave the surrounding runtime logic unchanged unless removing the cast exposes a real typing or test issue that must be fixed locally. +- Update affected tests so they exercise the same behavior without routing values through `cast(..., Any)`. + +## Capabilities + +### New Capabilities +- `cast-free-authoring-and-tests`: Contrib authoring helpers and tests no longer contain `cast(..., Any)` no-ops. 
+ +### Modified Capabilities +- `runtime-execution-records`: Execution record construction and merge logic use the concrete runtime status type directly instead of erasing it through `Any`. + +## Impact + +- Affected code: `src/daggerml/contrib/api.py`, `src/daggerml/_internal/ops/index.py`, `src/daggerml/_internal/exec_state.py`, and tests covering contrib and configuration contracts. +- Affected systems: contrib dagclass compilation/run helpers, runtime execution-record persistence, and type-checked test coverage. +- No intended runtime behavior changes; this is a direct code cleanup. diff --git a/openspec/changes/archive/2026-05-13-remove-casts-to-any/specs/cast-free-authoring-and-tests/spec.md b/openspec/changes/archive/2026-05-13-remove-casts-to-any/specs/cast-free-authoring-and-tests/spec.md new file mode 100644 index 0000000..0d1d2d7 --- /dev/null +++ b/openspec/changes/archive/2026-05-13-remove-casts-to-any/specs/cast-free-authoring-and-tests/spec.md @@ -0,0 +1,19 @@ +## ADDED Requirements + +### Requirement: Contrib authoring helpers SHALL not use `cast(..., Any)` no-ops +The system SHALL preserve the current `api.dagclass`, `api.run`, and `api.funkify` behavior without using `cast(..., Any)` in their implementation. + +#### Scenario: Dagclass decoration still works after cast removal +- **WHEN** `api.dagclass` decorates and runs a class that previously passed through `cast(..., Any)` sites +- **THEN** the existing decoration, compilation, and runtime behavior remain unchanged + +#### Scenario: Funkify and dag staging still work after cast removal +- **WHEN** contrib runnable values are staged through the existing `api.funkify` and DAG execution flow +- **THEN** the same runtime results are produced without routing those values through `cast(..., Any)` + +### Requirement: Tests SHALL not use `cast(..., Any)` no-ops +The test suite SHALL validate contrib and configuration behavior without using `cast(..., Any)` to pass values through unchanged. 
+ +#### Scenario: Invalid funkify input remains rejected without `Any` casts +- **WHEN** test coverage passes a concrete invalid input to `api.funkify` +- **THEN** the API still raises the existing invalid-input repository error diff --git a/openspec/changes/archive/2026-05-13-remove-casts-to-any/specs/runtime-execution-records/spec.md b/openspec/changes/archive/2026-05-13-remove-casts-to-any/specs/runtime-execution-records/spec.md new file mode 100644 index 0000000..6f0cc86 --- /dev/null +++ b/openspec/changes/archive/2026-05-13-remove-casts-to-any/specs/runtime-execution-records/spec.md @@ -0,0 +1,12 @@ +## ADDED Requirements + +### Requirement: Execution record status typing SHALL remain concrete during runtime updates +The runtime SHALL preserve the declared execution-status literal type from adapter results through execution-record creation and execution-record merge operations. The implementation SHALL NOT erase `ExecutionRecord["status"]` through `cast(..., Any)` when persisting or merging runtime execution state. + +#### Scenario: First execution record uses the adapter result status directly +- **WHEN** `IndexOps.start_fn` constructs an execution record from a valid adapter result +- **THEN** the record stores the concrete runtime status value without erasing it through `Any` + +#### Scenario: Merge preserves the higher-ranked status without type erasure +- **WHEN** execution-record merge logic chooses between current and incoming statuses +- **THEN** it keeps the higher-ranked concrete status value and returns an `ExecutionRecord` whose `status` remains within the declared runtime status set diff --git a/openspec/changes/archive/2026-05-13-remove-casts-to-any/tasks.md b/openspec/changes/archive/2026-05-13-remove-casts-to-any/tasks.md new file mode 100644 index 0000000..5eeb837 --- /dev/null +++ b/openspec/changes/archive/2026-05-13-remove-casts-to-any/tasks.md @@ -0,0 +1,15 @@ +## 1. 
Contrib Authoring Typing + +- [x] 1.1 Remove `cast(..., Any)` from `src/daggerml/contrib/api.py` while keeping the current dagclass, run, and funkify behavior unchanged. +- [x] 1.2 Make only the smallest local follow-up edits needed if raw cast removal exposes a concrete type or test failure in contrib authoring paths. + +## 2. Execution Record Typing + +- [x] 2.1 Remove `cast(..., Any)` from execution-record construction in `src/daggerml/_internal/ops/index.py`. +- [x] 2.2 Remove `cast(..., Any)` from execution-record merge logic in `src/daggerml/_internal/exec_state.py`, making only minimal local fixes if required. + +## 3. Test Cleanup And Verification + +- [x] 3.1 Remove `cast(..., Any)` usage from contrib integration tests by passing concrete values directly and making only minimal local fixes if required. +- [x] 3.2 Remove `cast(..., Any)` usage from config contract tests while preserving the legacy-alias rejection assertion. +- [x] 3.3 Run the focused test coverage for contrib integration and config/internal execution-state paths, and confirm no `cast(..., Any)` usages remain. diff --git a/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/.openspec.yaml b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/.openspec.yaml new file mode 100644 index 0000000..66dd08a --- /dev/null +++ b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-14 diff --git a/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/design.md b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/design.md new file mode 100644 index 0000000..9e06d7c --- /dev/null +++ b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/design.md @@ -0,0 +1,104 @@ +## Context + +`src/daggerml/_internal/dml.py` currently exposes a clean public calling surface, but its implementation is organized around a large private helper-method layer on `Dml` itself. 
Namespace classes mostly route through `._dml`, yet `_DagNamespace` still carries a private helper method of its own. The result is that callers see one orchestration boundary while the implementation relies on a second, informal instance-level private API inside the same file. + +The requested change is narrower than a public API redesign. `Dml` may continue to keep `_context` and `_tempdirs` as private state, and namespace instances may continue to keep `._dml`. The goal is to move helper behavior out of private methods and into file-level functions so the `Dml` instance boundary is simpler and more explicit. + +## Goals / Non-Goals + +**Goals:** +- Remove private helper methods from `Dml`. +- Remove private helper methods and extra private attrs from namespace classes, leaving only `._dml` on namespaces. +- Re-home helper behavior in module-level functions within `src/daggerml/_internal/dml.py`. +- Preserve existing caller-facing `Dml`, `dml.dag`, `dml.runtime`, `dml.admin`, and `dml.config` behavior. +- Keep the change mostly mechanical so test updates can focus on structure rather than semantics. + +**Non-Goals:** +- Renaming `Dml._context` or `Dml._tempdirs`. +- Redesigning `Dml` public methods, namespace names, payload formats, or revision grammar. +- Moving helper logic into other modules unless an existing imported helper already owns that concern. +- Performing unrelated cleanup in `daggerml.api`, CLI modules, or lower-level ops classes. + +## Decisions + +### Decision: Replace `Dml` private helper methods with module-level helper functions +Helper behaviors currently implemented as `Dml._...` methods will move to top-level functions in `dml.py` that accept the `Dml` instance explicitly when needed. 
+ +This includes: + +- ops acquisition and dispatch helpers +- payload shaping helpers +- revision-resolution wrappers that bind the current runtime context +- remote/S3 helper setup + +Rationale: + +- keeps `Dml` itself limited to state and public workflows +- makes helper dependencies explicit through function arguments instead of implicit `self` +- matches the requested architectural rule without changing behavior ownership + +Alternatives considered: + +- Keep private methods and only rename them: rejected because it does not change the architectural shape. +- Move helpers into a new module: rejected because the request specifically prefers functions defined in `dml.py` and the current helpers are tightly local to this file. + +### Decision: Keep private state exceptions exactly where requested +`Dml` will continue to store `_context` and `_tempdirs`, and namespace dataclasses will continue to store `._dml`. No other private attrs will be added to those objects. + +Rationale: + +- respects the explicit boundary the change is meant to enforce +- avoids churn in tests and call sites that already inspect `_context` + +Alternatives considered: + +- Make `context`/`tempdirs` public: rejected because the user explicitly narrowed the change away from that. + +### Decision: Namespace methods delegate only through module-level helpers plus `._dml` +Namespace methods will stop calling private `Dml` methods. Instead, they will call file-level helpers such as revision resolvers, ops accessors, payload builders, or simple utility functions. + +Rationale: + +- removes the second-layer private API from `Dml` +- keeps namespace objects thin and declarative + +Alternatives considered: + +- Let namespace methods inline all helper logic: rejected because it would duplicate orchestration details and make the file harder to maintain. 
+ +### Decision: Keep `_OpsProxy` as an implementation detail only if it remains the smallest clean mechanism +The contract for this change is about `Dml` and namespace private methods/attrs, not every private symbol in the file. If an `_OpsProxy`-style helper remains the smallest way to keep ops lifetimes and call syntax stable, it may stay as a file-local implementation detail. If direct helper functions are clearer, it may be removed during implementation. + +Rationale: + +- preserves flexibility during the refactor +- keeps the spec focused on the actual architectural boundary the user cares about + +Alternatives considered: + +- Require `_OpsProxy` removal in the design: rejected because it is not necessary to satisfy the requested boundary rule. + +## Risks / Trade-offs + +- [Risk] Mechanical call rewrites accidentally change which ops helper is used or when an ops handle is opened/closed -> Mitigation: preserve existing helper responsibilities one-for-one first, then simplify only after tests pass. +- [Risk] Helper extraction could introduce naming collisions with imported resolver functions such as `resolve_revision` -> Mitigation: use distinct helper names that communicate Dml-bound context explicitly. +- [Risk] Structural contract tests may lag behind the new boundary and fail despite behavior remaining correct -> Mitigation: update tests in the same change to assert the new no-private-helper rule directly. +- [Trade-off] Module-level helper functions are less encapsulated than instance-private methods -> Mitigation: keep them file-local and narrowly scoped to `dml.py`. + +## Migration Plan + +1. Inventory all `Dml._...` helper methods and group them by role: ops dispatch, payload building, revision resolution, and remote setup. +2. Introduce equivalent module-level helper functions in `dml.py` with explicit `dml` parameters where state/context is required. +3. Rewrite namespace methods and `Dml` public methods to call the new helper functions. +4. 
Remove the replaced private helper methods from `Dml` and the private helper method from `_DagNamespace`. +5. Update structural tests to validate the new boundary while preserving existing behavioral assertions. +6. Run targeted tests for `dml`, CLI-facing `Dml` workflows, and contract suites that inspect the shared boundary. + +Rollback strategy: + +- Revert the helper extraction as one change if behavior or lifecycle regressions appear, then re-apply with tighter test coverage around the affected helper family. + +## Open Questions + +- Should the implementation keep `_OpsProxy` as a file-local helper or replace it with direct dispatch helpers everywhere? +- Should tests assert an explicit allowlist of remaining private attrs/methods on `Dml` and namespaces, or only assert the absence of the removed helper names? diff --git a/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/proposal.md b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/proposal.md new file mode 100644 index 0000000..9583b8f --- /dev/null +++ b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/proposal.md @@ -0,0 +1,25 @@ +## Why + +`src/daggerml/_internal/dml.py` currently uses a large private helper-method layer on `Dml`, and `_DagNamespace` still exposes a private helper method. That makes the caller-facing `Dml` boundary harder to reason about because orchestration logic is split between public methods and an informal private instance API. + +## What Changes + +- Remove private helper methods from `daggerml._internal.dml:Dml` and replace them with module-level functions in `dml.py`. +- Keep `Dml` private instance state limited to `_context` and `_tempdirs`. +- Keep namespace private instance state limited to `._dml` and remove any remaining private namespace helper methods. +- Update `Dml` public methods and namespace methods to delegate through module-level helper functions instead of `self._...` helper methods. 
+- Preserve existing public runtime, DAG, admin, config, and repository behavior; this is an internal boundary cleanup, not a user-facing feature change. + +## Capabilities + +### New Capabilities + +### Modified Capabilities +- `unified-dml-surface`: tighten the internal `Dml` boundary so private state remains limited to `_context` and `_tempdirs`, while helper logic lives at module scope and namespaces only retain `._dml` as private state. + +## Impact + +- Affected code: `src/daggerml/_internal/dml.py` and tests that assert `Dml`/namespace structure directly. +- Affected APIs: internal `Dml` implementation shape and namespace implementation shape; no intended change to documented caller-facing methods. +- Tests: internal contract tests and any tests that depend on private helper methods or namespace helper structure. +- Systems: improves the clarity of the shared `Dml` orchestration boundary without changing repository semantics. diff --git a/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/specs/unified-dml-surface/spec.md b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..f3f61fc --- /dev/null +++ b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/specs/unified-dml-surface/spec.md @@ -0,0 +1,17 @@ +## MODIFIED Requirements + +### Requirement: `Dml` stores only runtime context and temporary-directory bookkeeping +The shared `Dml` class SHALL keep only `_context` and `_tempdirs` as private instance attributes. Helper behavior that supports `Dml` public methods SHALL live in module-level functions within `daggerml._internal.dml` rather than in private `Dml` instance methods. 
+ +#### Scenario: Namespace and helper access do not require extra Dml instance fields +- **WHEN** a caller uses any public namespace on `Dml` +- **THEN** the namespace behavior is derived from `_context`, `_tempdirs`, and delegated helper logic without introducing additional private `Dml` instance attributes + +#### Scenario: Dml public workflows do not depend on private helper methods +- **WHEN** a `Dml` repository, runtime, DAG, admin, or config workflow needs helper behavior such as ops dispatch, payload shaping, or revision binding +- **THEN** that helper behavior executes through module-level functions in `daggerml._internal.dml` rather than through `Dml._...` instance methods + +#### Scenario: Namespace objects keep only Dml as private state +- **WHEN** a caller inspects the namespace objects exposed by `Dml` +- **THEN** each namespace object keeps only `._dml` as private instance state +- **AND** namespace helper behavior does not rely on additional private attrs or private helper methods on the namespace object diff --git a/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/tasks.md b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/tasks.md new file mode 100644 index 0000000..b736acf --- /dev/null +++ b/openspec/changes/archive/2026-05-14-remove-dml-private-helpers/tasks.md @@ -0,0 +1,17 @@ +## 1. Extract Dml helper functions + +- [x] 1.1 Inventory the current `Dml._...` helper methods in `src/daggerml/_internal/dml.py` and group them by responsibility (ops dispatch, payload shaping, revision binding, remote setup). +- [x] 1.2 Introduce equivalent module-level helper functions in `dml.py`, using explicit `dml` parameters wherever runtime state is required. +- [x] 1.3 Rewrite `Dml` public repository/bootstrap methods to use the new module-level helpers and remove the replaced private helper methods from `Dml`. + +## 2. 
Simplify namespace implementations + +- [x] 2.1 Rewrite runtime, DAG, admin, and config namespace methods to call module-level helpers instead of `self._dml._...` helper methods. +- [x] 2.2 Remove `_DagNamespace._stringify_node_selector` and replace it with a module-level utility. +- [x] 2.3 Confirm namespace instances retain only `._dml` as private state and do not introduce new private helper attrs or methods. + +## 3. Preserve behavior and structural contracts + +- [x] 3.1 Update tests that inspect `Dml` or namespace structure to assert the allowed remaining private attrs and the absence of removed private helper methods. +- [x] 3.2 Run targeted contract and integration tests covering `Dml` repository methods, namespace workflows, and any call paths sensitive to ops lifecycle behavior. +- [x] 3.3 Resolve any regressions without expanding the refactor scope beyond the documented boundary cleanup. diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/.openspec.yaml b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/.openspec.yaml new file mode 100644 index 0000000..9f70866 --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-15 diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/design.md b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/design.md new file mode 100644 index 0000000..e23922f --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/design.md @@ -0,0 +1,66 @@ +## Context + +Current cancellation is planned from execution ids and stops after writing `cancel-requested` into execution state. That leaves the caller-facing API mismatched with user intent and leaves contrib-managed external work running until some later poll notices the control bit. 
The new flow starts from an index, treats that `index_id` as a synthetic execution root in the same `dml/exec/state/*` and `dml/exec/edges/*` namespace as normal executions, freezes that index by moving it to `indexes/.cancelled/<index_id>.json` under lock, computes the rooted active execution closure from that frozen index root, and invokes executor-owned cancel behavior as a bounded sweep. + +The design must preserve two existing constraints: execution state remains the durable source of truth for execution status, and contrib adapters remain the transport layer that simply forwards `execution_status` and `cancel_requested_by` to executors. Executors, not adapters, own cleanup of external resources. + +## Goals / Non-Goals + +**Goals:** +- Add a caller-facing `Dml.runtime.cancel(index_id)` entrypoint. +- Persist index-root lineage in the same S3 execution state and edge namespaces as normal executions. +- Freeze an index atomically so no new descendants can be attached during cancellation. +- Plan cancellation from the frozen index root instead of from user-supplied execution ids. +- Recheck terminal state, live-caller ownership, and `cancel-requested` writes under a per-execution lock. +- Treat `cancel-requested` as an executor update step and let each executor tear down its own external resources. +- Keep cancellation bounded: once the sweep has run, remove the temporary cancelled-index marker. + +**Non-Goals:** +- Clearing or rewriting persisted execution-record `state` after cancellation. +- Introducing a long-lived cancellation orchestration state machine. +- Guaranteeing that already marked `cancel-requested` executions are reaped immediately if no adapter cycle is triggered. +- Changing adapters to own backend-specific cancellation logic. + +## Decisions + +### Index-rooted cancellation is the only caller-facing cancel API +`Dml.runtime.cancel(index_id)` is the new orchestration boundary.
Users cancel work by naming the mutable index they own rather than execution ids discovered from internals. This matches the object users actually manipulate. + +Alternative considered: keep an execution-id-based cancel API and add a thin helper that resolves execution ids from an index. Rejected because it preserves the wrong ownership boundary and makes caller semantics depend on runtime internals. + +### A cancelled index is represented by an atomic move to `indexes/.cancelled/<index_id>.json` +The runtime will lock the index, move the live index object to `indexes/.cancelled/<index_id>.json`, and release the lock before planning the sweep. The moved object is a short-lived freeze marker, not a new persistent state machine. Any code that mutates indexes must treat the live path as absent once the move succeeds. + +Alternative considered: keep the index in place and add a sidecar tombstone. Rejected because two objects would need to be consulted to know whether the index is still mutable. + +### Indexes are synthetic execution roots in S3 state and edge storage +The runtime will persist an `exec/state/<index_id>.json` object for each live index and will record rooted lineage from that index using the same canonical `exec/edges/<callee_execution_id>/<caller_execution_id>.json` namespace used for execution-to-execution dependencies. This keeps rooted traversal, caller counting, and cancellation planning in one graph model rather than splitting lineage between local index pointers and remote execution records. + +Alternative considered: keep index lineage in a separate `indexes/`-specific graph namespace. Rejected because cancellation would need custom traversal and special-case caller counting for index roots.
+ +### Cancellation is a bounded sweep, not an eventual workflow +“Done” means the runtime completed one rooted cancellation sweep: it walked the active call graph from the frozen index, identified eligible executions, marked them `cancel-requested` under lock, invoked their adapters in cancel mode, and then removed the temporary cancelled-index marker. Some executions may remain in `cancel-requested` afterward; they are no longer active and can be reaped later. + +Alternative considered: keep the cancelled-index marker until every descendant becomes terminal. Rejected because it turns cancellation into a long-running coordinator and complicates rollback and observability. + +### Execution eligibility is decided under a per-execution lock +For each candidate id in the rooted set, the runtime acquires that id's lock, rereads current state, skips `succeeded`, `failed`, and `cancelled`, counts active callers while excluding `cancel-requested`, writes `cancel-requested` before any adapter cancel call, and adds that record's dependencies into the work set regardless of whether the adapter is invoked. If a caller resumes after the lock releases, it must create a new execution rather than attaching work to the cancelled one. + +Alternative considered: plan from one global snapshot without per-execution rechecks. Rejected because caller ownership races would make shared dependencies unsafe to cancel. + +### Executors handle cancellation according to whether update normally dispatches to `runnable.sub` +Executors fall into two groups: +- Update-dispatch executors such as `ssh` continue to call `runnable.sub` when `execution_status == "cancel-requested"`, then perform any executor-owned cleanup. +- Detached-work executors such as `batch`, `docker`, `script`, and `cfn` do not call `runnable.sub` on cancel updates; they cancel their own external resources directly and return quickly. 
+ +This keeps adapters transport-only and preserves the runtime rule that executor behavior is backend-specific. + +### Cancel invocation success is operational success, not DAG success +`runtime.cancel` owns the S3/index/execution-state side effects. Executor cancel calls return a good operational result when the cancel update was processed without transport/runtime exceptions, even if the underlying job is being rolled back asynchronously. This is especially important for CloudFormation, where rollback must start quickly but finish later. + +## Risks / Trade-offs + +- [Executions can remain in `cancel-requested` after the sweep] → Treat `cancel-requested` as non-active and add a later reap path if needed. +- [Detached backends differ in what “cancel” means] → Keep the shared contract small and specify backend-specific teardown rules per executor. +- [CloudFormation rollback is asynchronous] → Return quickly with stack context and let backend progress continue outside the bounded sweep. +- [Short-lived cancelled-index marker reduces auditability] → Rely on execution state and normal logs for post-hoc inspection rather than keeping the freeze marker permanently. diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/proposal.md b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/proposal.md new file mode 100644 index 0000000..47b5fa0 --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/proposal.md @@ -0,0 +1,31 @@ +## Why + +Cancellation is currently keyed by execution id and stops at recording `cancel-requested`, which does not match user intent or actually tear down contrib-managed work. We need index-rooted cancellation that freezes the index, walks the rooted live call graph, and lets executors perform their own cancel-time cleanup. + +## What Changes + +- Add `Dml.runtime.cancel(index_id)` as the caller-facing cancellation entrypoint. 
+- Treat each `index_id` as a synthetic execution root in `dml/exec/state/*` and `dml/exec/edges/*` so index lineage uses the same S3 graph model as runtime executions. +- Change cancellation planning to start from an index root instead of user-supplied execution ids. +- Atomically move the target index to `indexes/.cancelled/<index_id>.json` under lock so the index is frozen during cancellation and cannot be modified further. +- Walk the rooted active execution graph for that cancelled index and mark eligible executions `cancel-requested` under per-execution locks after rechecking terminal state and live-caller ownership. +- Update contrib executors to treat `cancel-requested` as an update step and perform executor-owned teardown of external resources. +- Let executors that normally call `runnable.sub` on update continue doing so during cancellation; let executors that do not call `runnable.sub` cancel their own external jobs directly. +- Complete the bounded cancellation sweep by deleting the temporary `indexes/.cancelled/<index_id>.json` marker after the algorithm runs. + +## Capabilities + +### New Capabilities +- `executor-cancellation`: executor-side handling of `cancel-requested`, including update-time sub-dispatch rules and teardown of external resources for script, docker, batch, cfn, and ssh flows. + +### Modified Capabilities +- `execution-admin-controls`: change cancellation from execution-id-rooted planning to index-rooted planning with per-execution lock/recheck semantics and bounded sweep completion. +- `runtime-execution-records`: extend execution-state storage so index ids can be stored and traversed as synthetic execution roots. +- `execution-call-edges`: allow rooted lineage edges whose caller id is an index id stored in the same canonical edge namespace. +- `unified-dml-surface`: add `runtime.cancel` to the shared `Dml` runtime namespace.
+ +## Impact + +- Affected code: `src/daggerml/_internal/dml.py`, runtime/index and remote cancellation planning, and contrib executors/adapters. +- Affected systems: execution graph traversal, index lifecycle, remote execution state and edge storage, and external executor backends such as Batch, Docker, SSH, supervisor-managed scripts, and CloudFormation. +- Caller impact: cancellation moves from execution-id-oriented internals to an index-oriented runtime API that better matches user intent. diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/execution-admin-controls/spec.md b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/execution-admin-controls/spec.md new file mode 100644 index 0000000..3030105 --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/execution-admin-controls/spec.md @@ -0,0 +1,58 @@ +## MODIFIED Requirements + +### Requirement: Manual cancellation SHALL target index identity +The system SHALL treat cancellation as an index-rooted execution-graph operation keyed by index id. A user cancellation request SHALL lock the target index, atomically move `indexes/<index_id>.json` to `indexes/.cancelled/<index_id>.json`, and plan cancellation from the rooted execution set initialized as `{index_id}`. + +The cancellation algorithm SHALL operate as follows: + +1. Lock the target index. +2. Move `indexes/<index_id>.json` to `indexes/.cancelled/<index_id>.json` atomically. +3. Release the index lock. +4. Initialize `unseen = {index_id}`. +5. While `unseen` is not empty, remove one candidate id. +6. Acquire that candidate id's lock. +7. While holding the lock, reread `exec/state/<candidate_id>.json`; if it does not exist, release the lock and continue. +8. If `status` is `succeeded`, `failed`, or `cancelled`, release the lock and continue. +9. Add that record's `dependencies` to `unseen`. +10.
While still holding the lock, count callers of that candidate id whose state exists and whose status is not `cancel-requested`, `cancelled`, `succeeded`, or `failed`. +11. If that active caller count is `0`, update `exec/state/<candidate_id>.json` with compare-and-swap semantics so that `status = "cancel-requested"` and `cancel_requested_by` identifies the requesting user, then invoke the candidate's adapter update path with `execution_status = "cancel-requested"`. +12. If that cancel update returns terminal `cancelled`, update `exec/state/<candidate_id>.json` so that `status = "cancelled"`. +13. Release the candidate lock. +14. After the bounded sweep completes, delete `indexes/.cancelled/<index_id>.json`. + +#### Scenario: Runtime cancel freezes the index before planning +- **WHEN** a user cancels index `idx1` +- **THEN** the system SHALL atomically move `indexes/idx1.json` to `indexes/.cancelled/idx1.json` under lock before cancellation planning begins + +#### Scenario: Rooted cancellation starts from the index id itself +- **WHEN** a user cancels index `idx1` +- **THEN** the planner SHALL initialize its rooted work set as `{idx1}` +- **AND** it SHALL read `exec/state/idx1.json` as the synthetic root state record + +#### Scenario: Cancellation marks eligible rooted executions cancel-requested +- **WHEN** a rooted execution reachable from the cancelled index has no other active callers +- **THEN** the system SHALL update `exec/state/<execution_id>.json` so that `status = "cancel-requested"` +- **AND** `cancel_requested_by` identifies the requesting user + +#### Scenario: Dependencies are added to the rooted work set before cancel dispatch +- **WHEN** the planner examines candidate `e1` +- **THEN** it SHALL add `e1`'s recorded `dependencies` to the rooted work set before deciding whether to invoke adapter cancellation for `e1` + +#### Scenario: Cancellation sweep removes the temporary cancelled-index marker +- **WHEN** the runtime completes the bounded cancellation sweep for index `idx1` +- **THEN** it SHALL delete
`indexes/.cancelled/idx1.json` + +### Requirement: Cancellation propagation SHALL stop when a callee still has a live caller +The local planner SHALL propagate cancellation only across non-terminal rooted dependency records. It SHALL stop recursing when it reaches `succeeded`, `failed`, or `cancelled`. Among non-terminal records in the dependency closure, it SHALL request cancellation only when a candidate record has no remaining active callers. For this algorithm, an active caller is a caller record whose state exists and whose `status` is not `cancel-requested`, `cancelled`, `succeeded`, or `failed`. + +#### Scenario: Shared dependency is preserved while another caller remains live +- **WHEN** execution `e2` depends on `e3` and a different active execution `e4` also depends on `e3` +- **THEN** cancelling the rooted index for `e2` SHALL NOT require `e3` to be cancelled while `e4` remains an active caller + +#### Scenario: Cancel-requested caller is not active +- **WHEN** execution `e3` is called only by executions whose status is `cancel-requested`, `cancelled`, `succeeded`, or `failed` +- **THEN** the planner SHALL treat `e3` as having no active callers for cancellation eligibility + +#### Scenario: Terminal dependency is not cancelled +- **WHEN** execution `e2` depends on execution `e3` and `e3` is already terminal +- **THEN** cancelling the rooted index for `e2` SHALL NOT request cancellation for `e3` diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/execution-call-edges/spec.md b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/execution-call-edges/spec.md new file mode 100644 index 0000000..8cac2fc --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/execution-call-edges/spec.md @@ -0,0 +1,33 @@ +## MODIFIED Requirements + +### Requirement: Call-edge records SHALL represent realized rooted dependencies +The runtime SHALL record only realized rooted dependencies. 
An edge SHALL mean that caller id `caller_execution_id` was observed to depend on callee execution `callee_execution_id` during runtime execution, even if that dependency is discovered during a later `start_fn` poll cycle. The caller id MAY be either a normal execution id or a synthetic root index id. + +#### Scenario: Dependency discovered after initial launch still creates edge +- **WHEN** execution `e0` does not know about callee `e1` on its first poll but discovers that dependency on a later poll +- **THEN** the runtime SHALL create the edge record for `e1 <- e0` when that dependency becomes known + +#### Scenario: Repeated observation does not require a second edge fact +- **WHEN** execution `e0` rediscovers an existing dependency on `e1` +- **THEN** the runtime SHALL continue to treat `e1 <- e0` as one canonical edge fact + +#### Scenario: Index root creates rooted dependency edge +- **WHEN** index `idx1` starts execution `e1` +- **THEN** the runtime SHALL treat `e1 <- idx1` as one canonical rooted edge fact + +### Requirement: Runtime SHALL persist canonical edge records by callee execution id +The runtime SHALL persist each rooted dependency as the immutable object `exec/edges/<callee_execution_id>/<caller_execution_id>.json`. The payload SHALL include only `caller_execution_id` and `callee_execution_id`.
+ +#### Scenario: Edge record is written at canonical path +- **WHEN** execution `e0` discovers a dependency on execution `e1` +- **THEN** the runtime SHALL write `exec/edges/e1/e0.json` +- **AND** that object SHALL contain JSON with `caller_execution_id = "e0"` and `callee_execution_id = "e1"` + +#### Scenario: Reverse lineage query lists callers by callee execution id +- **WHEN** an invalidation planner needs all callers of execution `e1` +- **THEN** it SHALL obtain them by reading the objects under `exec/edges/e1/` + +#### Scenario: Index root uses the same canonical edge namespace +- **WHEN** index `idx1` starts execution `e1` +- **THEN** the runtime SHALL write `exec/edges/e1/idx1.json` +- **AND** that object SHALL contain JSON with `caller_execution_id = "idx1"` and `callee_execution_id = "e1"` diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/executor-cancellation/spec.md b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/executor-cancellation/spec.md new file mode 100644 index 0000000..a0c7136 --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/executor-cancellation/spec.md @@ -0,0 +1,31 @@ +## ADDED Requirements + +### Requirement: Executors SHALL handle `cancel-requested` as an update step +When the runtime invokes an executor with `execution_status = "cancel-requested"`, the executor SHALL treat that invocation as a cancellation update rather than as a fresh launch. Executors that normally dispatch to `runnable.sub` during update SHALL continue to dispatch to `runnable.sub` once in cancellation mode before performing executor-owned cleanup. Executors that do not normally dispatch to `runnable.sub` during update SHALL cancel their own external resources directly. 
+ +#### Scenario: Update-dispatch executor forwards cancellation update +- **WHEN** an executor that normally calls `runnable.sub` on update receives `execution_status = "cancel-requested"` +- **THEN** it SHALL issue its normal update-time sub-dispatch once before executor-owned cleanup + +#### Scenario: Detached-work executor cancels backend directly +- **WHEN** an executor that does not normally call `runnable.sub` on update receives `execution_status = "cancel-requested"` +- **THEN** it SHALL cancel or tear down its own external work without invoking `runnable.sub` + +### Requirement: Executors SHALL tear down external resources during cancellation +Executor-owned cancellation SHALL tear down external resources and SHALL NOT mutate the persisted execution record `state`. Script execution SHALL terminate the supervisor-managed process tree and remove its work directory. Docker execution SHALL stop and remove the container and SHALL remove any temporary loaded image. Batch execution SHALL cancel or terminate the Batch job as appropriate and SHALL deregister the temporary job definition. CloudFormation execution SHALL initiate rollback or cancellation of the stack operation and return without waiting for the rollback to finish. SSH execution SHALL return the nested adapter's cancellation result and SHALL NOT create additional remote wrapper state. 
+ +#### Scenario: Batch cancellation tears down Batch resources +- **WHEN** the Batch executor receives `execution_status = "cancel-requested"` +- **THEN** it SHALL cancel or terminate the Batch job and deregister the temporary job definition + +#### Scenario: CloudFormation cancellation returns quickly with rollback context +- **WHEN** the CloudFormation executor receives `execution_status = "cancel-requested"` +- **THEN** it SHALL start rollback or cancellation of the stack operation +- **AND** it SHALL return promptly with enough stack context for the caller to identify the affected stack + +### Requirement: Successful cancel updates SHALL report `cancelled` +When an executor processes a cancel update without transport or runtime exceptions, it SHALL return `status = "cancelled"` even if backend cleanup or rollback continues asynchronously. The runtime cancellation workflow SHALL treat that result as confirmation that the cancel update was handled rather than as a successful DAG execution result. + +#### Scenario: Cancel update reports success after teardown request +- **WHEN** an executor successfully processes a `cancel-requested` update +- **THEN** it SHALL return `status = "cancelled"` diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/runtime-execution-records/spec.md b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/runtime-execution-records/spec.md new file mode 100644 index 0000000..c8e9924 --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/runtime-execution-records/spec.md @@ -0,0 +1,90 @@ +## MODIFIED Requirements + +### Requirement: Runtime SHALL maintain one mutable execution object per execution id +The runtime SHALL persist `exec/state/<execution_id>.json` as the single compare-and-swap updated execution object for that execution.
That object SHALL include `execution_id`, `cache_key`, `created_at`, `status`, `state`, `dependencies`, `updated_at`, and `cancel_requested_by`, where `cancel_requested_by` is `str | null`. `status` SHALL be one of `running`, `cancel-requested`, `cancelled`, `succeeded`, or `failed`. `state` SHALL contain the durable adapter state returned by the first adapter call for that execution and SHALL be `null` when no durable adapter state exists. Once `state` is first written for an execution, the runtime SHALL NOT replace or merge it on later updates. `dependencies` SHALL be the deduped set of discovered callee execution ids for that execution. Execution-object updates SHALL be monotone: newly discovered dependencies MAY be added, terminal status MAY replace non-terminal status, `cancel-requested` MAY precede `cancelled`, and existing dependencies SHALL NOT be removed. + +The same execution-object schema SHALL also be used for each live index id. For index-root records, the object path SHALL be `exec/state/<index_id>.json`, `execution_id` SHALL equal the `index_id`, `cache_key` SHALL equal the `index_id`, `state` SHALL be `null`, and `dependencies` SHALL track the deduped set of execution ids started from that index.
+ +The execution-object schema SHALL be: + +- `execution_id: str` +- `cache_key: str` +- `created_at: int` +- `status: "running" | "cancel-requested" | "cancelled" | "succeeded" | "failed"` +- `state: object | null` +- `dependencies: list[str]` +- `updated_at: int` +- `cancel_requested_by: str | null` + +#### Scenario: First adapter call creates the execution object +- **WHEN** the first adapter call for a new execution returns any valid adapter result +- **THEN** the runtime SHALL create `exec/state/<execution_id>.json` +- **AND** that object SHALL contain the returned adapter `state` when one exists + +#### Scenario: First execution object records creation time +- **WHEN** the runtime first creates `exec/state/<execution_id>.json` +- **THEN** that object SHALL contain `created_at` +- **AND** `created_at` SHALL remain unchanged on later updates + +#### Scenario: Resume uses stored execution state +- **WHEN** `start_fn` resumes an active execution +- **THEN** it SHALL load the adapter `state` from `exec/state/<execution_id>.json` +- **AND** it SHALL pass that stored state to the adapter + +#### Scenario: Later running result does not replace stored execution state +- **WHEN** the runtime invokes an adapter for an existing execution and the adapter returns `running` with a different `state` +- **THEN** the runtime SHALL keep the existing stored `state` in `exec/state/<execution_id>.json` + +#### Scenario: Late dependency discovery expands execution summary +- **WHEN** execution `e0` later discovers a dependency on execution `e1` +- **THEN** the runtime SHALL update `exec/state/e0.json` so that `dependencies` contains `e1` + +#### Scenario: Dependency merge survives compare-and-swap retry +- **WHEN** a compare-and-swap update to `exec/state/e0.json` observes a conflicting write +- **THEN** the runtime SHALL reread, merge the dependency set and monotone status fields, and retry the conditional write + +#### Scenario: Cancellation requester is recorded on cancel request +- **WHEN** a user requests cancellation for execution
`e0` +- **THEN** the runtime SHALL update `exec/state/e0.json` so that `status = "cancel-requested"` +- **AND** `cancel_requested_by` contains the requesting user identity + +#### Scenario: Execution object includes minimal execution fields +- **WHEN** the runtime persists `exec/state/e0.json` +- **THEN** that object SHALL contain `execution_id`, `cache_key`, `created_at`, `status`, `state`, `dependencies`, `updated_at`, and `cancel_requested_by` + +#### Scenario: Execution object rejects unknown status values +- **WHEN** the runtime validates or persists `exec/state/e0.json` +- **THEN** `status` SHALL be one of `running`, `cancel-requested`, `cancelled`, `succeeded`, or `failed` + +#### Scenario: Index id is persisted as a synthetic root execution +- **WHEN** runtime work is started from index `idx1` +- **THEN** the runtime SHALL maintain `exec/state/idx1.json` +- **AND** that object SHALL use `execution_id = "idx1"`, `cache_key = "idx1"`, and `state = null` + +#### Scenario: Index root accumulates launched execution dependencies +- **WHEN** index `idx1` starts execution `e1` +- **THEN** the runtime SHALL update `exec/state/idx1.json` so that `dependencies` contains `e1` + +### Requirement: Adapter envelope and result schema SHALL follow the runtime-owned execution contract +The adapter envelope SHALL include `argv_ptr`, `cache_key`, `execution_id`, `remote`, `runnable`, `state`, `execution_status`, and `cancel_requested_by`. The adapter result SHALL use only `running`, `succeeded`, `failed`, or `cancelled` statuses. `running` MUST include durable `state`. `succeeded` MUST include `dag_id`. `failed` MUST include `error`. `cancelled` MUST identify a successful cancel update and MAY omit durable execution output. 
+ +#### Scenario: First adapter call uses null state +- **WHEN** the runtime invokes an adapter for a new execution +- **THEN** the adapter envelope SHALL include `state = null` + +#### Scenario: Cancel update includes cancellation fields +- **WHEN** the runtime invokes an adapter for a cancel update +- **THEN** the adapter envelope SHALL include `execution_status = "cancel-requested"` +- **AND** it SHALL include `cancel_requested_by` + +#### Scenario: Later adapter state is ignored after first write +- **WHEN** the runtime invokes an adapter for an existing execution and the adapter returns `running` with a different `state` +- **THEN** the runtime SHALL continue using the existing stored `state` from `exec/state/<execution_id>.json` + +#### Scenario: Cancel update may return cancelled +- **WHEN** an executor completes a cancel update successfully +- **THEN** the adapter result MAY use `status = "cancelled"` + +#### Scenario: Pending is rejected +- **WHEN** an adapter returns `pending` +- **THEN** the runtime SHALL reject that result as invalid adapter output diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/unified-dml-surface/spec.md b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..9f788b2 --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/specs/unified-dml-surface/spec.md @@ -0,0 +1,30 @@ +## MODIFIED Requirements + +### Requirement: Shared `Dml` exposes the fixed method namespaces +The shared `Dml` class SHALL expose this caller-facing method surface: + +- top level: `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, `revert` +- `dag`: `list`, `get`, `checkout`, `delete` +- `admin.index`: `list`, `get`, `delete` +- `admin.cache`: `invalidate` +- `admin.remote`: `list`, `gc` +- `admin`: `gc` +- `runtime`: `create`, `describe`, `put_literal`, `put_import`, `start_fn`, `cancel`, `commit`
+- `config`: `get`, `set`, `show` +- `ops`: `commit`, `head`, `dag`, `node`, `index`, `cache`, `remote`, `gc`, `config` + +#### Scenario: Top-level repository methods are present +- **WHEN** a caller inspects the shared `Dml` class +- **THEN** the repository porcelain workflows are available on the top level rather than through raw subsystem factories + +#### Scenario: DAG, admin, runtime, and config methods remain namespaced +- **WHEN** a caller needs DAG inspection, admin maintenance, runtime staging behavior, or config access +- **THEN** the shared `Dml` exposes those methods under `dag`, `admin`, `runtime`, and `config` namespaces respectively + +#### Scenario: Runtime namespace exposes cancel +- **WHEN** a caller needs to cancel work rooted at an index +- **THEN** the shared `Dml` exposes that workflow as `dml.runtime.cancel(index_id)` + +#### Scenario: Exact subsystem objects are grouped under ops +- **WHEN** a caller needs direct exact-input subsystem behavior such as `CommitOps`, `HeadOps`, or `IndexOps` +- **THEN** the shared `Dml` exposes those objects under `dml.ops.*` rather than as direct top-level `Dml` attributes diff --git a/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/tasks.md b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/tasks.md new file mode 100644 index 0000000..4c73240 --- /dev/null +++ b/openspec/changes/archive/2026-05-15-index-rooted-runtime-cancel/tasks.md @@ -0,0 +1,27 @@ +## 1. Runtime Cancel Entry Point + +- [x] 1.1 Add `Dml.runtime.cancel(index_id)` and return a JSON-ready cancellation summary. +- [x] 1.2 Implement index locking and atomic move from `indexes/<index_id>.json` to `indexes/.cancelled/<index_id>.json`. +- [x] 1.3 Ensure cancelled indexes are treated as frozen and cannot be mutated through normal runtime index workflows. +- [x] 1.4 Persist `exec/state/<index_id>.json` for live indexes and keep their rooted `dependencies` updated as executions are launched. + +## 2.
Index-Rooted Cancellation Planning + +- [x] 2.1 Record rooted lineage edges in `exec/edges/<callee_execution_id>/<caller_execution_id>.json` when an index launches an execution. +- [x] 2.2 Resolve the rooted active execution set from `{index_id}` instead of from user-supplied execution ids. +- [x] 2.3 Add per-execution locking around terminal-state recheck, dependency expansion, active-caller counting, and `cancel-requested` state updates. +- [x] 2.4 Invoke adapter update paths with `execution_status="cancel-requested"`, persist terminal `cancelled` when returned, and delete the temporary cancelled-index marker when the bounded sweep completes. + +## 3. Executor Cancellation Behavior + +- [x] 3.1 Update shared executor handling so `cancel-requested` is treated as an update step and update-dispatch executors continue sub-dispatch during cancellation. +- [x] 3.2 Implement script and docker cancel teardown for supervisor/process groups, containers, and temporary images or workdirs. +- [x] 3.3 Implement batch and cfn cancel teardown so Batch jobs are canceled and deregistered and CloudFormation starts rollback quickly with stack context. +- [x] 3.4 Confirm ssh cancel updates pass through the nested adapter result without adding extra remote wrapper state. +- [x] 3.5 Update adapter/executor validation so `cancelled` is accepted as a terminal cancel result. + +## 4. Verification + +- [x] 4.1 Add or update contract tests for `runtime.cancel`, cancelled-index freezing, and rooted cancellation planning. +- [x] 4.2 Add or update contrib executor tests covering cancel-update behavior for ssh, batch, docker, script, and cfn. +- [x] 4.3 Run the relevant targeted test suites for internal runtime cancellation and contrib executor cancellation.
diff --git a/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/.openspec.yaml b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/.openspec.yaml new file mode 100644 index 0000000..ab7f13b --- /dev/null +++ b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-16 diff --git a/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/design.md b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/design.md new file mode 100644 index 0000000..127ccb1 --- /dev/null +++ b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/design.md @@ -0,0 +1,116 @@ +## Context + +The runtime already treats `cache_key` as the computation identity for execution startup and resume. `start_fn` acquires the coordination lock by `cache_key`, reads the active execution pointer for that cache key, and either launches or resumes a single in-flight execution attempt. Cancellation diverges from that model today by locking the candidate execution id directly, even though the execution record already stores the canonical `cache_key` for the work being cancelled. + +The requested change keeps index-rooted cancellation as the entry point, but makes its remote coordination consistent with normal execution. The synthetic index execution record remains the root of the cancellation graph, and its recorded execution-id dependencies continue to drive cancellation planning. + +This change also redefines cancellation from an incremental bounded sweep into a full rooted-graph pass followed by a retryable cancellation loop. `Dml.runtime.cancel` owns that loop, emits diagnostics about each pass, and returns a structured statistics object. 
Each cancellation attempt must traverse the full execution graph reachable from the index, collect the full caller-callee graph, derive the rooted candidate executions, and then repeatedly attempt short-lived per-candidate cancellation steps until the rooted candidate set is exhausted. + +## Goals / Non-Goals + +**Goals:** +- Make cancellation acquire the same remote coordination lock identity as launch and resume: `cache_key`. +- Ensure the synthetic index-root execution transitions to `cancel-requested` before any graph work begins and to `cancelled` only after the rooted graph is fully cancelled. +- Traverse the full rooted execution graph on every cancellation attempt and derive rooted cancellation candidates from that graph. +- Evaluate cancellation eligibility against the global caller set stored in S3, not only the rooted traversal graph. +- Use short-lived per-candidate cache-key locks inside a retryable loop rather than one long-lived batch lock. +- Preserve rooted cancellation planning by execution id while resolving lock identity from each candidate execution record. +- Reduce races between cancellation and concurrent `start_fn` activity for the same computation. + +**Non-Goals:** +- Redesign the execution graph model, dependency recording, or invalidation flow. +- Change adapter payload shape beyond the existing `execution_status = "cancel-requested"` update path. +- Introduce new persistent coordination objects beyond the existing execution record, active pointer, and lock objects. + +## Decisions + +### 1. Cancellation will resolve `cache_key` from the execution record before locking + +Cancellation starts from execution ids because dependency edges and rooted planning are keyed by execution id. The lock, however, protects the computation identity, not the lineage identity. 
The cancellation path should therefore read `exec/state/<execution_id>.json`, obtain `cache_key`, construct `ExecutionState(cache_key, ...)`, and acquire that lock before re-reading and mutating the candidate execution record. + +Alternative considered: keep locking by `execution_id` and document cancellation as a special case. Rejected because it preserves the current mismatch and still allows cancellation to coordinate independently from launch/resume for the same computation. + +### 2. The index root record will be marked `cancel-requested` before any graph work begins + +The synthetic index execution record represents the rooted cancellation request itself. Marking it `cancel-requested` first makes the remote state reflect that cancellation is in progress before traversal, caller counting, or adapter cancellation begins. This is also required for correct active-caller counting: descendants must observe that their rooted caller is already in `cancel-requested` rather than still appearing live. The index root should only move to `cancelled` after the entire rooted graph has been cancelled without error. + +Alternative considered: leave the index root as `running` until the end and mark only real executions. Rejected because it hides cancellation progress from readers and leaves the root record inconsistent with the requested operation. + +### 3. Cancellation will use a two-phase algorithm: graph discovery, then retryable cancellation + +Each cancellation attempt should first walk the full execution graph rooted at the index's direct dependencies and collect `graph := {(caller, callee), ...}`. From that graph, the runtime derives `candidate_set := {callee}` and seeds `own_executions := candidate_set.copy()`. Those sets define the rooted cancellation universe for this pass. + +The runtime should then run a loop over the remaining `candidate_set`. 
In parallel across the current candidate set, it should attempt to acquire the candidate's cache-key lock, return immediately on lock contention, recompute the candidate's active callers from the global reverse-edge set in S3, and decide one of three outcomes: keep retrying, drop the candidate from `candidate_set`, or drop it from both `candidate_set` and `own_executions`. This makes cancellation retryable, keeps lock hold times short, and allows ownership to shrink as externally referenced executions are identified. + +Alternative considered: continue the current incremental work-queue sweep that interleaves discovery and cancellation decisions. Rejected because it couples graph traversal to transient node status, makes the meaning of `cancelled` leak into graph completion, and makes active-caller results depend on discovery order. + +### 4. Caller ownership must use the global reverse-edge set + +`callers(c)` must mean all recorded callers of `c` from the reverse-edge records in S3, not only callers discovered while traversing the cancelled index's rooted graph. If index `A` and index `B` both call `X`, then cancelling `A` must not cancel `X`. If `A` calls `X` and `Y`, and `Y` calls `X`, then `cancel(A)` may cancel `X` when `callers(X) = {A, Y}` because every caller belongs to the cancelled index's rooted ownership set. + +Alternative considered: infer ownership only from the rooted traversal graph. Rejected because it can wrongly cancel shared executions that still have callers from unrelated indexes or execution subgraphs. + +### 5. Candidate processing is loop-based and short-lock + +For each candidate in the current loop iteration, `Dml.runtime.cancel` should attempt to lock, inspect, act, and unlock quickly. The worker returns `None` when it could not make progress, `-1` when the candidate is discovered not to be fully owned by the cancelled index, and `+1` when the candidate reached per-execution `cancelled`. 
The outer loop removes `+1` candidates from `candidate_set` and removes `-1` candidates from both `candidate_set` and `own_executions`, logs iteration diagnostics, and repeats until `candidate_set` is empty. + +Alternative considered: hold one long-lived lock batch across the whole owned subgraph. Rejected because it increases contention and makes retries more expensive. + +### 6. `Dml.runtime.cancel` will return structured cancellation statistics + +`Dml.runtime.cancel` should return a deterministic summary object rather than ad hoc diagnostics. At minimum, that object should report the target `index_id`, total loop `iterations`, the size of the rooted `candidate_set` discovered during graph traversal, how many candidates were retained in `own_executions`, how many per-execution cancellations completed, how many candidates were removed because they had external active callers or were otherwise not eligible, and how many lock-contention retries occurred. This gives operators a stable surface for observability and tests a precise contract to assert. + +Alternative considered: return only a boolean or log-only diagnostics. Rejected because it hides useful convergence information and makes cancellation behavior harder to validate automatically. + +### 7. Candidate locking will use a two-step read: resolve, then lock, then re-read + +The cancellation path should first read the candidate execution record without a lock to discover `cache_key`, then acquire the cache-key lock, then re-read the same execution record while holding that lock before making cancellation decisions. This preserves correctness if the record changed between the first read and lock acquisition. + +Alternative considered: add a new direct mapping from execution id to lock key. Rejected because the execution record already provides the needed mapping and is the authoritative source. + +### 8. 
`cancelled` is a per-execution cleanup-complete status, not a graph-complete status + +For a non-index execution, `status = "cancelled"` means that execution's cleanup is complete and the index-cancellation runtime does not need to invoke that execution's adapter chain again. It does not mean traversal can stop at that node or that the rooted graph has been fully cancelled. The index remains responsible for traversing the full graph on every cancellation attempt until the index itself can be marked `cancelled`. + +Alternative considered: treat a `cancelled` execution as a graph-terminal pruning point. Rejected because descendant executions may still require cancellation work even when a parent execution's own cleanup is done. + +### 9. Completion cleanup remains gated on successful full-graph completion + +The temporary cancelled-index marker should still be removed only after the rooted graph has been fully processed and the index-cancellation runtime can conclude the graph is cancelled. If the cancellation pass fails, the marker should remain so cancellation can be retried, while the index root record remains `cancel-requested` rather than being advanced to `cancelled` prematurely. + +### 10. Terminal `cancelled` state is owned by the index-cancellation runtime + +Neither the remote execution runtime nor any single adapter in the adapter chain can authoritatively decide that an execution is fully `cancelled`. Each adapter layer may own cleanup for its own state, and a callee adapter does not know whether it is the leaf, the root, or an intermediate step in the chain. Because of that, a remote cancel update can report that one adapter handled its own cancellation work, but the index-cancellation runtime must continue driving the full adapter chain until every participating layer has had a chance to process cancellation. Only the index-cancellation runtime should persist terminal `cancelled` state, and only after that full chain completes. 
+ +Alternative considered: allow the first remote adapter or executor that returns `cancelled` to finalize the whole execution. Rejected because that can strand cleanup work in outer or sibling adapter layers that have not yet run their own cancellation handling. + +### 11. Unreachable remote-only adapter chains remain a known limitation + +Today, `Dml.runtime.cancel` can only actively drive cancellation work for adapter chains that are reachable from the index runtime process. If execution `X` was started by the index runtime but `X` then delegated to execution `Y` through bespoke adapters that only exist on some remote machine, the index runtime may be able to mark `Y` as `cancel-requested` but may not be able to invoke the adapter path that would actually finish cancelling `Y`. In that case, the retry loop can continue indefinitely until the user interrupts it, typically with `Ctrl+C`. + +This is an accepted limitation for now. The current design relies on `cancel-requested` as the durable propagation signal so future remote executors and bespoke adapter stacks can learn to observe that state and complete their own cancellation work without requiring the index runtime to invoke them directly. + +Alternative considered: block this change until every remote-only executor path can autonomously react to `cancel-requested`. Rejected because the coordination and state-model improvements are still valuable now, even though some remote execution environments will remain partially manual. + +## Risks / Trade-offs + +- [Extra record read before locking] -> Mitigation: cancellation already depends on execution-record reads; add tests that cover the resolve-lock-reread sequence. +- [Longer overall cancellation pass due to full-graph traversal] -> Mitigation: separate discovery from retryable cancellation processing, reuse stored dependency edges, and keep per-node cancellation work bounded within short lock windows. 
+- [Global caller reads may be stale relative to lock acquisition] -> Mitigation: recompute active callers under lock before acting and retry on later loop iterations. +- [Lock contention causes slow convergence] -> Mitigation: make each worker return `None` on lock failure and let `Dml.runtime.cancel` retry without holding unrelated locks; document contention as a sharp edge and keep locks intentionally short-lived. +- [Synthetic index root may remain `cancel-requested` after a failed sweep] -> Mitigation: treat that state as a visible retryable cancellation-in-progress marker and keep the cancelled-index pointer for retried cleanup. +- [Spec drift between launch/resume and cancellation locking semantics] -> Mitigation: update both cancellation and runtime execution-record capabilities in the same change. +- [A remote adapter reports `cancelled` before outer adapter cleanup has run] -> Mitigation: keep terminal `cancelled` state owned by the index-cancellation runtime and continue driving cancellation through the full adapter chain before finalizing state. +- [Per-execution `cancelled` status is mistaken for graph completion] -> Mitigation: define `cancelled` explicitly as a per-execution status and make index `cancelled` contingent on full rooted-graph completion. +- [Remote-only bespoke adapter chains are unreachable from the index runtime] -> Mitigation: document this as a sharp edge, persist `cancel-requested` for those executions, and expect user interruption until downstream runtimes adopt autonomous `cancel-requested` handling. + +## Migration Plan + +1. Update the cancellation contract and runtime locking contract. +2. Change the cancellation implementation to traverse the full rooted graph, compute `graph`, seed `candidate_set` and `own_executions`, and drive a retryable cancellation loop using global caller ownership and short-lived cache-key locks. +3. 
Update contract tests for full-graph traversal, global caller ownership, loop outcomes (`None`, `-1`, `+1`), per-execution `cancelled` semantics, and index-root status transitions. +4. Rollback, if needed, is a code revert; no persisted schema migration is required. + +## Open Questions + +- None. Lock contention and unreachable remote-only adapter chains remain documented sharp edges, but both are accepted for this design phase. diff --git a/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/proposal.md b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/proposal.md new file mode 100644 index 0000000..6748a45 --- /dev/null +++ b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/proposal.md @@ -0,0 +1,29 @@ +## Why + +Cancellation currently coordinates work by locking the candidate execution id, which conflicts with the runtime's broader model that uses `cache_key` as the computation identity for execution coordination. This mismatch makes cancellation race against normal launch and resume paths and obscures how an index-rooted cancellation request should transition remote execution state. + +## What Changes + +- Change cancellation coordination to resolve each candidate execution's `cache_key` from its execution record and acquire the cache-key lock before mutating execution state or invoking cancel updates. +- Change cancellation planning to traverse the full execution graph rooted at the index, derive rooted candidates from that graph, and then drive cancellation through a retryable short-lock loop owned by `Dml.runtime.cancel`. +- Update the cancellation workflow so the synthetic index-root execution is marked `cancel-requested` first, and only transitions to `cancelled` after the full rooted graph has been cancelled successfully. 
+- Clarify that rooted cancellation starts from the index's synthetic execution record and expands through its recorded execution-id dependencies while using cache-key locks for real execution candidates. +- Clarify that execution `cancelled` status is per-execution only: it means cleanup for that execution is complete and its adapter no longer needs to be called, but graph traversal must still continue until the index-rooted graph is fully cancelled. +- Have `Dml.runtime.cancel` log cancellation diagnostics and return loop statistics such as iteration counts. +- Explicitly document the current sharp edge that if a descendant execution can only be cancelled by adapters unreachable from the index runtime, `Dml.runtime.cancel` may loop indefinitely until the user interrupts it, with `cancel-requested` serving as the only current propagation signal. + +## Capabilities + +### New Capabilities +None. + +### Modified Capabilities +- `execution-admin-controls`: Change the manual cancellation algorithm to mark the index root `cancel-requested`, traverse the full rooted graph, evaluate global caller ownership from S3-backed reverse edges, and run a retryable cache-key lock loop inside `Dml.runtime.cancel` while cancelling rooted executions. +- `runtime-execution-records`: Clarify that cancellation also acquires execution coordination locks by `cache_key`, even when the operation starts from an execution id or synthetic index id, and that `cancelled` is a per-execution cleanup-complete status rather than a graph-complete status. + +## Impact + +- Affected code: `src/daggerml/_internal/ops/index.py`, `src/daggerml/_internal/exec_state.py`, and cancellation-focused contract tests. +- Affected behavior: cancellation graph traversal, cancellation state transitions, cancellation lock acquisition, and interaction between cancellation and concurrent execution/resume paths. 
+- Affected operational behavior: cancellation may remain user-interrupt-driven for remote-only bespoke adapter chains that do not yet honor `cancel-requested` autonomously. +- No new public API surface is expected, but `Dml.runtime.cancel` behavior, returned cancellation statistics, runtime coordination semantics, and cancellation tests will change. diff --git a/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/specs/execution-admin-controls/spec.md b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/specs/execution-admin-controls/spec.md new file mode 100644 index 0000000..9d4f419 --- /dev/null +++ b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/specs/execution-admin-controls/spec.md @@ -0,0 +1,156 @@ +## MODIFIED Requirements + +### Requirement: Manual cancellation SHALL target index identity +The system SHALL treat cancellation as an index-rooted execution-graph operation keyed by index id. `Dml.runtime.cancel` SHALL lock the target index, atomically move `indexes/<index_id>.json` to `indexes/.cancelled/<index_id>.json`, mark the synthetic index execution record `cancel-requested`, traverse the full rooted execution graph, and run a retryable cancellation loop over the rooted candidate executions. + +The cancellation algorithm SHALL operate as follows: + +1. Lock the target index. +2. Move `indexes/<index_id>.json` to `indexes/.cancelled/<index_id>.json` atomically. +3. Release the index lock. +4. Ensure `exec/state/<index_id>.json` exists as the synthetic root state record. +5. Update `exec/state/<index_id>.json` with compare-and-swap semantics so that `status = "cancel-requested"` and `cancel_requested_by` identifies the requesting user before any descendant cancellation work begins. +6. Traverse the full execution graph rooted at the synthetic root record's `dependencies` and collect `graph := {(caller, callee), ...}`. +7. Define `candidate_set := {callee | (caller, callee) in graph}` and `own_executions := candidate_set.copy()`. +8. 
While `candidate_set` is not empty, `Dml.runtime.cancel` SHALL run a parallel worker across the current `candidate_set` and log loop diagnostics. +9. For each candidate execution id, attempt to acquire the candidate's `cache_key` lock; if lock acquisition fails, return `None` for that candidate. +10. While holding the lock, reread `exec/state/<execution_id>.json`; if it does not exist, release the lock and return `-1`. +11. While holding the lock, read `active_callers(c)` for the candidate from the global reverse-edge records in S3 and determine current `status`. +12. If `len(active_callers(c) - own_executions) > 0` or the candidate is not in an active status, release the lock and return `-1`. +13. Otherwise, update `exec/state/<execution_id>.json` with compare-and-swap semantics so that `status = "cancel-requested"` and `cancel_requested_by` identifies the requesting user before invoking that candidate's adapter update path with `execution_status = "cancel-requested"`. +14. If the candidate's full adapter chain reaches terminal `cancelled`, update `exec/state/<execution_id>.json` so that `status = "cancelled"`, release the lock, and return `+1`. +15. Otherwise, release the lock and return `None`. +16. After one loop iteration completes, remove every `+1` candidate from `candidate_set` and remove every `-1` candidate from both `candidate_set` and `own_executions`. +17. Repeat until `candidate_set` is empty. +18. After every execution remaining in `own_executions` has status `cancelled`, update `exec/state/<index_id>.json` so that `status = "cancelled"`. +19. After the rooted graph has been cancelled successfully for the index-owned executions, delete `indexes/.cancelled/<index_id>.json`. +20. `Dml.runtime.cancel` SHALL return a cancellation statistics object. + +`Dml.runtime.cancel` MAY continue looping indefinitely when a candidate execution can only be fully cancelled by adapters or runtimes that are unreachable from the index runtime process. 
In that case, the runtime SHALL continue persisting and observing `cancel-requested` state but is not required to guarantee autonomous completion. + +The cancellation statistics object SHALL have the following schema: + +- `index_id: str` +- `iterations: int` +- `graph_edges: int` +- `candidate_count: int` +- `own_execution_count: int` +- `cancelled_count: int` +- `dropped_count: int` +- `lock_retry_count: int` + +#### Scenario: Runtime cancel freezes the index before planning +- **WHEN** a user cancels index `idx1` +- **THEN** the system SHALL atomically move `indexes/idx1.json` to `indexes/.cancelled/idx1.json` under lock before cancellation planning begins + +#### Scenario: Rooted cancellation starts from the index root dependencies +- **WHEN** a user cancels index `idx1` +- **THEN** the runtime SHALL update `exec/state/idx1.json` so that `status = "cancel-requested"` +- **AND** it SHALL initialize rooted graph traversal from `exec/state/idx1.json` dependencies rather than from `{idx1}` itself + +#### Scenario: Root cancellation is recorded before descendant work +- **WHEN** a user cancels index `idx1` +- **THEN** the runtime SHALL persist `exec/state/idx1.json` with `status = "cancel-requested"` before counting callers for descendants or invoking any adapter cancellation updates + +#### Scenario: Cancellation discovers the full rooted graph before processing +- **WHEN** a user cancels index `idx1` +- **THEN** the runtime SHALL traverse the full execution graph reachable from `exec/state/idx1.json` dependencies +- **AND** it SHALL collect caller-callee edges for the full rooted graph before processing cancellation decisions for candidate executions + +#### Scenario: Candidate and ownership sets are initialized from rooted traversal +- **WHEN** rooted graph traversal for index `idx1` produces caller-callee graph `G` +- **THEN** the runtime SHALL derive `candidate_set` from the callee nodes in `G` +- **AND** it SHALL initialize `own_executions` as a copy of 
`candidate_set` + +#### Scenario: Caller ownership uses the global reverse-edge set +- **WHEN** index `A` and unrelated index `B` both call execution `X` +- **THEN** `callers(X)` SHALL include both `A` and `B` +- **AND** `cancel(A)` SHALL NOT cancel `X` + +#### Scenario: Recursive ownership remains cancellable +- **WHEN** index `A` calls `X` and `Y` +- **AND** `Y` calls `X` +- **AND** the global caller set is `callers(X) = {A, Y}` +- **THEN** `cancel(A)` MAY cancel `X` + +#### Scenario: Candidate lock contention yields retry +- **WHEN** the loop examines candidate execution `e1` +- **AND** `e1`'s cache-key lock cannot be acquired +- **THEN** the worker SHALL return `None` +- **AND** the loop SHALL leave `e1` in `candidate_set` for retry + +#### Scenario: Cancellation loop reports diagnostics +- **WHEN** `Dml.runtime.cancel` runs one or more cancellation loop iterations +- **THEN** it SHALL emit diagnostics describing loop progress + +#### Scenario: Cancellation returns loop statistics +- **WHEN** `Dml.runtime.cancel` completes for index `idx1` +- **THEN** it SHALL return cancellation statistics +- **AND** those statistics SHALL include the number of loop iterations + +#### Scenario: Cancellation statistics report rooted graph size +- **WHEN** rooted graph traversal for `idx1` collects 7 caller-callee edges and 4 candidate executions +- **THEN** the returned statistics SHALL include `graph_edges = 7` +- **AND** they SHALL include `candidate_count = 4` + +#### Scenario: Cancellation statistics report loop outcomes +- **WHEN** one cancellation run for `idx1` cancels 2 executions, drops 1 execution from ownership, and retries 3 lock-contention events +- **THEN** the returned statistics SHALL include `cancelled_count = 2` +- **AND** they SHALL include `dropped_count = 1` +- **AND** they SHALL include `lock_retry_count = 3` + +#### Scenario: Cancellation statistics identify the target index +- **WHEN** `Dml.runtime.cancel` completes for index `idx1` +- **THEN** the 
returned statistics SHALL include `index_id = "idx1"` + +#### Scenario: Candidate cancellation runs only without active callers +- **WHEN** the planner examines candidate execution `e1` +- **AND** `e1` still has at least one active caller outside `own_executions` +- **THEN** the runtime SHALL NOT mark `exec/state/e1.json` as `cancel-requested` +- **AND** it SHALL NOT invoke adapter cancellation for `e1` +- **AND** it SHALL NOT mark `exec/state/e1.json` as `cancelled` +- **AND** it SHALL remove `e1` from both `candidate_set` and `own_executions` for the current cancellation run + +#### Scenario: Candidate cancel request is recorded before cancellation work +- **WHEN** the planner examines candidate execution `e1` +- **AND** `e1` has no active callers outside `own_executions` +- **THEN** the runtime SHALL persist `exec/state/e1.json` with `status = "cancel-requested"` before invoking adapter cancellation for `e1` + +#### Scenario: Active callers are rechecked under lock +- **WHEN** execution `e1` is in the current `candidate_set` +- **AND** the runtime has acquired `e1`'s cache-key lock for the current loop iteration +- **THEN** it SHALL recompute `e1`'s active-caller set before marking `cancel-requested` or invoking adapter cancellation + +#### Scenario: Terminal cancelled waits for the full adapter chain +- **WHEN** execution `e1` has no active callers outside `own_executions` +- **AND** one adapter layer reports cancellation progress before outer adapter cleanup has finished +- **THEN** the runtime SHALL keep `exec/state/e1.json` at `status = "cancel-requested"` +- **AND** the index-cancellation runtime SHALL NOT persist `status = "cancelled"` until the full adapter chain has completed cancellation handling + +#### Scenario: Unreachable remote-only adapter chain can stall cancellation +- **WHEN** execution `e1` delegates cancellation work to descendant execution `e2` +- **AND** completing cancellation for `e2` requires bespoke adapters or a runtime unreachable 
from the index runtime process +- **THEN** `Dml.runtime.cancel` MAY continue retrying without converging to terminal `cancelled` +- **AND** it SHALL keep the relevant execution records at `status = "cancel-requested"` until another runtime handles cancellation or the user interrupts the loop + +#### Scenario: Cancelled execution still does not prune graph traversal +- **WHEN** execution `e1` is already `cancelled` +- **AND** `e1` has recorded dependencies +- **THEN** the runtime SHALL NOT invoke adapter cancellation for `e1` +- **AND** it SHALL still include `e1`'s descendants in rooted graph traversal for index cancellation + +#### Scenario: Successful cancellation removes only the candidate from the retry set +- **WHEN** the loop worker for execution `e1` returns `+1` +- **THEN** the runtime SHALL remove `e1` from `candidate_set` +- **AND** it SHALL keep `e1` in `own_executions` + +#### Scenario: Failed ownership removes candidate from both sets +- **WHEN** the loop worker for execution `e1` returns `-1` +- **THEN** the runtime SHALL remove `e1` from `candidate_set` +- **AND** it SHALL remove `e1` from `own_executions` + +#### Scenario: Cancellation sweep marks the synthetic root cancelled after graph completion +- **WHEN** the runtime completes the retry loop for index `idx1` +- **AND** every execution remaining in `own_executions` has status `cancelled` +- **THEN** it SHALL update `exec/state/idx1.json` so that `status = "cancelled"` +- **AND** it SHALL delete `indexes/.cancelled/<index_id>.json` diff --git a/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/specs/runtime-execution-records/spec.md b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/specs/runtime-execution-records/spec.md new file mode 100644 index 0000000..5d10623 --- /dev/null +++ b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/specs/runtime-execution-records/spec.md @@ -0,0 +1,97 @@ +## MODIFIED Requirements + +### Requirement: Runtime
SHALL separate cache identity from execution identity +The runtime SHALL treat `cache_key` as the stable computation identity and `execution_id` as the stable identity of one execution attempt. The runtime SHALL acquire execution coordination locks by `cache_key` for launch, resume, and cancellation, SHALL propagate `execution_id` in the adapter envelope, and SHALL use `execution_id` as the identity for dependency edges, execution state objects, and invalidation records. + +#### Scenario: First launch creates a new execution identity +- **WHEN** `start_fn` observes a cache miss and confirms there is no active execution for the computed `cache_key` +- **THEN** it creates a new `execution_id` for that launch attempt +- **AND** it invokes the adapter with both `cache_key` and `execution_id` + +#### Scenario: Resume preserves the current execution identity +- **WHEN** `start_fn` observes an active execution for a `cache_key` +- **THEN** it SHALL reuse the referenced `execution_id` +- **AND** it SHALL NOT create a new `execution_id` for that execution while resuming it + +#### Scenario: Cancellation resolves lock identity from the execution record +- **WHEN** cancellation targets execution `e1` +- **AND** `exec/state/e1.json` records `cache_key = "ck1"` +- **THEN** the runtime SHALL acquire the execution coordination lock for `ck1` +- **AND** it SHALL continue to use `e1` as the execution-record and dependency-graph identity + +### Requirement: Runtime SHALL maintain one mutable execution object per execution id +The runtime SHALL persist `exec/state/<execution_id>.json` as the single execution object for that execution, updated via compare-and-swap. That object SHALL include `execution_id`, `cache_key`, `created_at`, `status`, `state`, `dependencies`, `updated_at`, and `cancel_requested_by`, where `cancel_requested_by` is `str | null`. `status` SHALL be one of `running`, `cancel-requested`, `cancelled`, `succeeded`, or `failed`.
`state` SHALL contain the durable adapter state returned by the first adapter call for that execution and SHALL be `null` when no durable adapter state exists. Once `state` is first written for an execution, the runtime SHALL NOT replace or merge it on later updates. `dependencies` SHALL be the deduped set of discovered callee execution ids for that execution. Execution-object updates SHALL be monotone: newly discovered dependencies MAY be added, terminal status MAY replace non-terminal status, `cancel-requested` MAY precede `cancelled`, and existing dependencies SHALL NOT be removed. + +The same execution-object schema SHALL also be used for each live index id. For index-root records, the object path SHALL be `exec/state/<index_id>.json`, `execution_id` SHALL equal the `index_id`, `cache_key` SHALL equal the `index_id`, `state` SHALL be `null`, and `dependencies` SHALL track the deduped set of execution ids started from that index. + +The execution-object schema SHALL be: + +- `execution_id: str` +- `cache_key: str` +- `created_at: int` +- `status: "running" | "cancel-requested" | "cancelled" | "succeeded" | "failed"` +- `state: object | null` +- `dependencies: list[str]` +- `updated_at: int` +- `cancel_requested_by: str | null` + +#### Scenario: First adapter call creates the execution object +- **WHEN** the first adapter call for a new execution returns any valid adapter result +- **THEN** the runtime SHALL create `exec/state/<execution_id>.json` +- **AND** that object SHALL contain the returned adapter `state` when one exists + +#### Scenario: First execution object records creation time +- **WHEN** the runtime first creates `exec/state/<execution_id>.json` +- **THEN** that object SHALL contain `created_at` +- **AND** `created_at` SHALL remain unchanged on later updates + +#### Scenario: Resume uses stored execution state +- **WHEN** `start_fn` resumes an active execution +- **THEN** it SHALL load the adapter `state` from `exec/state/<execution_id>.json` +- **AND** it SHALL pass that stored state to the adapter
+ +#### Scenario: Later running result does not replace stored execution state +- **WHEN** the runtime invokes an adapter for an existing execution and the adapter returns `running` with durable `state` +- **THEN** the runtime SHALL keep the existing stored `state` in `exec/state/<execution_id>.json` + +#### Scenario: Late dependency discovery expands execution summary +- **WHEN** execution `e0` later discovers a dependency on execution `e1` +- **THEN** the runtime SHALL update `exec/state/e0.json` so that `dependencies` contains `e1` + +#### Scenario: Dependency merge survives compare-and-swap retry +- **WHEN** a compare-and-swap update to `exec/state/e0.json` observes a conflicting write +- **THEN** the runtime SHALL reread, merge the dependency set and monotone status fields, and retry the conditional write + +#### Scenario: Cancellation requester is recorded before cancellation work +- **WHEN** a user requests cancellation for execution `e0` +- **THEN** the runtime SHALL update `exec/state/e0.json` so that `status = "cancel-requested"` before invoking adapter cancellation work for `e0` +- **AND** `cancel_requested_by` contains the requesting user identity + +#### Scenario: Cancelled is per-execution cleanup completion +- **WHEN** the index-cancellation runtime persists `exec/state/e0.json` with `status = "cancelled"` +- **THEN** it SHALL mean cleanup for execution `e0` is complete +- **AND** it SHALL mean the index-cancellation runtime does not need to invoke adapter cancellation for `e0` again +- **AND** it SHALL NOT by itself mean the rooted execution graph is fully cancelled + +#### Scenario: Terminal cancelled is owned by the index-cancellation runtime across the full adapter chain +- **WHEN** a cancellation update for execution `e0` returns progress from one adapter layer +- **AND** other adapter layers in the chain may still require cleanup +- **THEN** the runtime SHALL keep `exec/state/e0.json` at `status = "cancel-requested"` +- **AND** the index-cancellation runtime
SHALL persist `status = "cancelled"` only after the full adapter chain has completed cancellation handling + +#### Scenario: Execution object includes minimal execution fields +- **WHEN** the runtime persists `exec/state/e0.json` +- **THEN** that object SHALL contain `execution_id`, `cache_key`, `created_at`, `status`, `state`, `dependencies`, `updated_at`, and `cancel_requested_by` + +#### Scenario: Execution object rejects unknown status values +- **WHEN** the runtime validates or persists `exec/state/e0.json` +- **THEN** `status` SHALL be one of `running`, `cancel-requested`, `cancelled`, `succeeded`, or `failed` + +#### Scenario: Index id is persisted as a synthetic root execution +- **WHEN** runtime work is started from index `idx1` +- **THEN** the runtime SHALL maintain `exec/state/idx1.json` +- **AND** that object SHALL use `execution_id = "idx1"`, `cache_key = "idx1"`, and `state = null` + +#### Scenario: Index root accumulates launched execution dependencies +- **WHEN** index `idx1` starts execution `e1` +- **THEN** the runtime SHALL update `exec/state/idx1.json` so that `dependencies` contains `e1` diff --git a/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/tasks.md b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/tasks.md new file mode 100644 index 0000000..acfd1c4 --- /dev/null +++ b/openspec/changes/archive/2026-05-16-fix-cancellation-cache-key-locking/tasks.md @@ -0,0 +1,27 @@ +## 1. Update Cancellation Coordination + +- [x] 1.1 Change `IndexOps.cancel` and its helper flow to mark the synthetic index execution record `cancel-requested` before any graph traversal or caller counting begins. +- [x] 1.2 Change cancellation planning to traverse the full rooted execution graph, collect caller-callee edges, and derive `candidate_set` plus `own_executions` for the current cancellation run. 
+- [x] 1.3 Implement the retry loop so each candidate attempts a short-lived cache-key lock, rechecks global active callers under lock, and returns `None`, `-1`, or `+1` to drive set updates. +- [x] 1.4 Keep the cancelled-index marker and synthetic index record in sync by setting the index status to `cancelled` only after the full rooted graph has been cancelled successfully. +- [x] 1.5 Make `Dml.runtime.cancel` own the retry loop, emit diagnostics for each pass, and return a structured cancellation statistics object. + +## 2. Align Runtime Locking Semantics + +- [x] 2.1 Refactor any cancellation helpers that currently construct `ExecutionState` from `execution_id` so execution coordination locks are always acquired by `cache_key`. +- [x] 2.2 Preserve execution-id-based dependency traversal and adapter cancel updates while ensuring short-lived cache-key lock acquisition does not break missing-record paths or already-cancelled per-execution fast paths. +- [x] 2.3 Recompute active callers from the global reverse-edge records in S3 under lock before marking `cancel-requested` or invoking adapter cancellation. +- [x] 2.4 Ensure per-execution `cancelled` means "no more adapter calls for this execution" without treating it as permission to prune descendant traversal. + +## 3. Verify Behavior + +- [x] 3.1 Update contract tests for cancellation to assert full rooted-graph traversal, `candidate_set`/`own_executions` initialization, global caller ownership, and index-root `cancel-requested` seeding. +- [x] 3.2 Add or adjust tests covering per-execution `cancelled` semantics so already-cancelled executions skip adapter work but still do not prune descendant traversal. +- [x] 3.3 Add or adjust tests covering loop outcomes (`None`, `-1`, `+1`), lock-contention retry, and active-caller rechecks under lock. 
+- [x] 3.4 Add or adjust tests covering retry/failure behavior so a failed cancellation sweep leaves the cancelled-index marker and does not prematurely mark the synthetic index root `cancelled`. +- [x] 3.5 Add or adjust tests covering `Dml.runtime.cancel` diagnostics and the returned cancellation statistics schema and counters. +- [x] 3.6 Run the relevant cancellation and execution contract test suites and confirm they pass. + +## 4. Known Limitations + +- [x] 4.1 Document that unreachable remote-only bespoke adapter chains can leave `Dml.runtime.cancel` retrying until user interruption, with `cancel-requested` as the only current propagation signal. diff --git a/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/.openspec.yaml b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/.openspec.yaml new file mode 100644 index 0000000..231e3ab --- /dev/null +++ b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-18 diff --git a/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/design.md b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/design.md new file mode 100644 index 0000000..af7cbb9 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/design.md @@ -0,0 +1,74 @@ +## Context + +`Dml` is now the shared orchestration boundary for the CLI and public API wrappers, but the surface is still weakly self-described at runtime. Callers can inspect method names and signatures, yet class purpose, method behavior, and parameter intent are mostly absent, which makes `help(...)`, editor assistance, and future introspection-driven tooling less useful than the stabilized surface now allows. + +This change is intentionally documentation-oriented rather than behavioral. The motivation is future programmatic CLI derivation from `Dml`, but that generation work is explicitly out of scope here. 
The immediate job is to make the existing `Dml` surface carry enough structured and human-readable metadata that later tooling can consume it without requiring `_cli` to duplicate command descriptions. + +## Goals / Non-Goals + +**Goals:** +- Make the shared `Dml` surface and its reachable namespaces meaningfully self-describing through runtime introspection. +- Add concise class docstrings that explain the purpose of `Dml` and each namespace object. +- Add concise method docstrings that explain operation behavior, constraints, and side effects. +- Add `typing.Annotated` metadata to public `Dml` and namespace method parameters so parameter help is available in a machine-readable form. +- Establish one consistent documentation split: defaults in signatures, parameter meaning in `Annotated`, and behavioral context in docstrings. + +**Non-Goals:** +- Generating CLI parsers, flags, or command trees from `Dml` in this change. +- Auto-synthesizing docstrings from `Annotated` metadata. +- Renaming underscored namespace classes or otherwise redesigning the public object model. +- Changing runtime behavior, payload shapes, or CLI grammar. + +## Decisions + +### Docstrings and `Annotated` serve different roles + +Class docstrings will describe what a namespace or boundary is for. Method docstrings will describe what the operation does, including any notable constraints or side effects. `Annotated` metadata will document what each user-facing parameter means. + +Rationale: +- Python does not automatically merge `Annotated` metadata into docstrings or `help(...)` prose. +- Keeping behavior in docstrings and argument meaning in `Annotated` avoids large repetitive parameter sections while still giving future tooling structured help text. + +Alternatives considered: +- Put all parameter documentation in docstrings and skip `Annotated`. Rejected because it leaves future CLI-oriented tooling without structured per-parameter metadata. 
+- Put all documentation in `Annotated` and keep docstrings minimal or absent. Rejected because class and method purpose would remain poorly expressed for human readers and `help(...)` usage. + +### Signature defaults remain the source of truth for defaults + +Default values will remain encoded only in the Python signature. `Annotated` metadata may explain the meaning of a defaulted parameter but will not restate the literal default unless an example is needed to clarify accepted forms. + +Rationale: +- The signature already exposes optionality and default values in a canonical place. +- Repeating defaults in metadata would create unnecessary drift risk for future introspection consumers. + +Alternatives considered: +- Repeat defaults inside `Annotated` help strings. Rejected because it duplicates information already present in the signature and makes later edits easier to miss. + +### The metadata scope includes namespaced methods, not just top-level `Dml` methods + +This change will cover top-level `Dml` methods and the methods reachable through `dml.config`, `dml.runtime`, `dml.dag`, and `dml.admin` sub-namespaces. + +Rationale: +- The future CLI shape maps naturally onto those namespaces, so structured help metadata is only useful if it is applied consistently across the whole public surface. +- Restricting metadata to top-level methods would leave the most CLI-like command groups undocumented at the parameter level. + +Alternatives considered: +- Annotate only top-level `Dml` methods for a smaller first pass. Rejected because it would produce an uneven introspection contract and weaken the CLI-generation motivation. + +### `Annotated` metadata uses concise string help text + +Parameter metadata will use plain string payloads inside `typing.Annotated`, with short examples only where accepted selector forms or URI shapes are genuinely ambiguous. + +Rationale: +- Plain strings are simple to read in source and easy for future tooling to consume. 
+- The current motivation is help text, not a richer schema for parser generation. + +Alternatives considered: +- Introduce a structured metadata object for parameters. Rejected because it adds API and maintenance overhead before there is a concrete consumer that requires it. + +## Risks / Trade-offs + +- [Documentation drift] → Keep docstrings short and focused on behavior while treating signatures and `Annotated` strings as the canonical parameter surface. +- [Over-annotated signatures become noisy] → Use concise help strings and reserve inline examples for ambiguous selector or URI parameters. +- [Future CLI generation may want richer metadata] → Start with plain-string `Annotated` values that are easy to migrate or wrap later if a stronger schema becomes necessary. +- [Underscored namespace class names still appear in some introspection output] → Accept that presentation limitation for now and keep the change focused on documentation and metadata. diff --git a/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/proposal.md b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/proposal.md new file mode 100644 index 0000000..187802e --- /dev/null +++ b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/proposal.md @@ -0,0 +1,27 @@ +## Why + +The shared `Dml` surface is now the canonical orchestration boundary for CLI and API workflows, but its runtime introspection story is still sparse: public methods and namespace objects largely lack docstrings, and parameter intent is not captured in machine-readable form. We want to make the `Dml` surface self-describing now so future tooling can programmatically derive CLI help and related introspection without redefining command semantics in `_cli`. + +## What Changes + +- Add class docstrings to the public `Dml` class and the namespace objects reachable from it so introspection can describe the purpose of each command group. 
+- Add method docstrings throughout the public `Dml` surface, including namespaced methods, so introspection can describe operation behavior, constraints, and side effects. +- Add `typing.Annotated` metadata to user-facing `Dml` and namespace method parameters so parameter meaning is available as structured help text for future CLI generation and related tooling. +- Define the documentation split for this surface: signature defaults remain the source of truth for default values, `Annotated` metadata documents parameter meaning, and docstrings document class/namespace purpose plus method behavior. +- Keep runtime behavior, CLI grammar, and output payloads unchanged in this change. + +## Capabilities + +### New Capabilities + +None. + +### Modified Capabilities + +- `unified-dml-surface`: Add introspection-oriented documentation and parameter metadata requirements for the shared `Dml` boundary and its public namespaces. + +## Impact + +- Affects `src/daggerml/_internal/dml.py` and any public wrappers or tests that assert `Dml` signature and documentation behavior. +- Does not change repository state formats, runtime execution behavior, or current CLI command semantics. +- Prepares the `Dml` surface for future programmatic CLI derivation by making descriptions available directly on classes, methods, and parameters. 
diff --git a/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/specs/unified-dml-surface/spec.md b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..2dadc43 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/specs/unified-dml-surface/spec.md @@ -0,0 +1,28 @@ +## ADDED Requirements + +### Requirement: Shared `Dml` surface SHALL be introspection-ready +The shared `Dml` boundary and its public namespaces SHALL expose runtime documentation that explains class purpose, method behavior, and parameter meaning without changing workflow semantics. + +#### Scenario: Namespace objects describe their purpose +- **WHEN** a caller inspects `Dml` or any namespace reachable through `dml.config`, `dml.runtime`, `dml.dag`, or `dml.admin` +- **THEN** the class exposes a docstring that describes the purpose of that boundary or namespace + +#### Scenario: Public methods describe behavior +- **WHEN** a caller inspects a public top-level or namespaced `Dml` method +- **THEN** the method exposes a docstring that describes the operation behavior and any notable constraints or side effects + +### Requirement: Shared `Dml` parameters SHALL expose machine-readable help metadata +Public parameters on the shared `Dml` surface and its public namespace methods SHALL use `typing.Annotated` metadata to describe parameter meaning, while Python signature defaults remain the source of truth for default values. 
+ +#### Scenario: Parameter meaning is available from annotations +- **WHEN** a caller inspects annotations for a public `Dml` method or a public method on a `Dml` namespace object with extras included +- **THEN** the parameter annotations include `Annotated` metadata that describes what each user-facing parameter means + +#### Scenario: Defaults remain in the signature +- **WHEN** a public `Dml` or namespaced method has a defaulted parameter +- **THEN** the default value remains represented by the Python signature +- **AND** the `Annotated` metadata does not become the source of truth for that default + +#### Scenario: Ambiguous selector parameters may include examples +- **WHEN** a public `Dml` parameter accepts potentially confusing selector or URI forms such as revision selectors or remote project identifiers +- **THEN** the `Annotated` metadata MAY include concise examples that clarify accepted forms without redefining the underlying grammar diff --git a/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/tasks.md b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/tasks.md new file mode 100644 index 0000000..7524b39 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-document-dml-cli-introspection/tasks.md @@ -0,0 +1,16 @@ +## 1. Add top-level `Dml` introspection metadata + +- [x] 1.1 Add a class docstring to `Dml` that explains its role as the shared orchestration boundary. +- [x] 1.2 Add concise method docstrings to the public top-level `Dml` methods describing behavior, constraints, and notable side effects. +- [x] 1.3 Add `typing.Annotated` help metadata to user-facing parameters on the public top-level `Dml` methods, keeping defaults in the signatures. + +## 2. Add namespace introspection metadata + +- [x] 2.1 Add class docstrings to the public namespace classes reachable from `Dml`, including admin sub-namespaces. 
+- [x] 2.2 Add concise method docstrings to public namespace methods under `config`, `runtime`, `dag`, and `admin`. +- [x] 2.3 Add `typing.Annotated` help metadata to user-facing parameters on public namespace methods, including concise examples for ambiguous selector or URI inputs. + +## 3. Verify the introspection contract + +- [x] 3.1 Add or update tests that inspect public `Dml` and namespace docstrings plus `Annotated` metadata with extras included. +- [x] 3.2 Run the relevant test suite and confirm the change preserves existing runtime and CLI behavior while exposing the new introspection metadata. diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/.openspec.yaml b/openspec/changes/archive/2026-05-17-make-remote-project-optional/.openspec.yaml new file mode 100644 index 0000000..66da1ae --- /dev/null +++ b/openspec/changes/archive/2026-05-17-make-remote-project-optional/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-17 diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/design.md b/openspec/changes/archive/2026-05-17-make-remote-project-optional/design.md new file mode 100644 index 0000000..bd0f991 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-make-remote-project-optional/design.md @@ -0,0 +1,62 @@ +## Context + +Current repository bootstrap and local project-config helpers still treat `remote.project` as part of repository identity. That conflicts with the existing runtime model, where many mutation and execution paths only need `remote.root`, while project sync flows need both `remote.root` and `remote.project`. + +The change crosses shared config resolution, local config persistence, init entrypoints, and project-sync guards. It also removes the older name-derived init path in favor of explicit optional remote configuration. + +## Goals / Non-Goals + +**Goals:** + +- Allow valid local repos whose config contains `remote.root` but omits `remote.project`. 
+- Make `remote.root` the capability gate for remote-backed mutation and execution. +- Make `remote.project` the capability gate for project-addressed sync behavior. +- Simplify init so it accepts only optional `remote_project` and optional `remote_root`. +- Restrict init-time fetch/checkout to cases where `remote.project` is configured. + +**Non-Goals:** + +- Changing remote ref layout or remote protocol payloads. +- Making remote-backed mutation work without `remote.root`. +- Adding alternate ways to derive project identity during init. + +## Decisions + +### Treat local `remote.project` as optional publication metadata + +Local repository validity will no longer require `remote.project`. Shared config resolution will continue validating branchless URI shape when the value is present, but local config loaders and helper accessors must tolerate absence. + +Alternative considered: keep `remote.project` mandatory in local config and only loosen init. Rejected because it preserves the same invalid local state boundary and continues conflating repo existence with publication identity. + +### Split capability checks by operation class + +Operations that create or mutate remote-backed runtime state will require `remote.root`. Project-addressed sync operations such as push, pull, fetch, and init-time checkout/fetch will additionally require `remote.project`. + +Alternative considered: continue relying on config-loader failures in sync paths. Rejected because it yields the wrong semantics and error boundary. + +### Remove name-derived init identity + +`Dml.init()` and CLI init will stop accepting `name`. Init will accept optional `remote_project` and optional `remote_root`, reject `remote_project` without `remote_root`, persist config when remote settings are provided, and skip fetch/checkout when `remote.project` is absent. + +Alternative considered: keep `name` as shorthand for deriving `remote.project`. 
Rejected because it reintroduces implicit publication identity and user-resolution requirements that this change is removing. + +### Keep recovery/bootstrap conditional on configured project identity + +Recovery for missing local DB state remains valid without `remote.project`. If `remote.project` is configured, init may fetch or check out project state using the configured remote context. If not, recovery only restores local repository state. + +## Risks / Trade-offs + +- [Local helpers may still assume project identity exists] -> Mitigation: audit `DmlProjectConfig` consumers and add explicit `remote.project` capability checks in project-sync paths. +- [Error behavior may shift from load-time failures to operation-time failures] -> Mitigation: define targeted spec requirements and tests for missing `remote.root` vs missing `remote.project`. +- [Removing `name` from init is breaking for callers and docs] -> Mitigation: capture the break explicitly in proposal/specs and update CLI/API contracts together. + +## Migration Plan + +- Update specs and docs first so the capability model is explicit. +- Change init entrypoints and local config helpers to allow missing `remote.project`. +- Move project sync validation to explicit operation guards. +- Update tests from name-derived init expectations to optional remote-project expectations. + +## Open Questions + +None. diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/proposal.md b/openspec/changes/archive/2026-05-17-make-remote-project-optional/proposal.md new file mode 100644 index 0000000..19a16cc --- /dev/null +++ b/openspec/changes/archive/2026-05-17-make-remote-project-optional/proposal.md @@ -0,0 +1,31 @@ +## Why + +Local repositories currently conflate remote transport configuration with project publication identity. That prevents valid repos from initializing unless they can derive or provide `remote.project`, even though many runtime and mutation flows only require `remote.root`. 
+ +## What Changes + +- Make local `remote.project` optional while keeping local `remote.root` as the capability gate for remote-backed mutation and execution. +- Change init semantics so `Dml.init()` no longer accepts `name`, accepts optional `remote_project` and optional `remote_root`, and rejects `remote_project` without `remote_root`. +- Restrict project-addressed sync operations such as push, pull, fetch, and init-time checkout/fetch to repositories with configured `remote.project`. +- Preserve recovery/bootstrap behavior so init only fetches or checks out project state when `remote.project` is configured. +- **BREAKING** Remove name-derived init identity flow and replace it with explicit optional `remote_project` configuration. + +## Capabilities + +### New Capabilities + +None. + +### Modified Capabilities + +- `shared-internal-configuration`: allow local project config to omit `remote.project` while preserving branchless validation when it is set. +- `init-input-normalization`: remove `name`-based init identity rules and redefine init inputs around optional `remote_project` and `remote_root`. +- `dmlops-init-recovery`: only fetch or pull during init recovery when `remote.project` is configured. +- `required-remote-config`: distinguish operations that require `remote.root` from project sync operations that additionally require `remote.project`. +- `remote-project-refs`: require configured `remote.project` before project-addressed push/pull/fetch/checkout behavior. + +## Impact + +- Affected code: shared config resolution, local project-config load/save helpers, `Dml.init`, `DmlOps.init`, and project sync operation guards. +- Affected APIs: Python `Dml.init`, CLI `dml init`, and error behavior for project sync commands without `remote.project`. +- Affected docs/specs: configuration, init, and remote sync capability contracts. 
diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/dmlops-init-recovery/spec.md b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/dmlops-init-recovery/spec.md new file mode 100644 index 0000000..508a36c --- /dev/null +++ b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/dmlops-init-recovery/spec.md @@ -0,0 +1,12 @@ +## MODIFIED Requirements + +### Requirement: Recovery mode pulls when a project URI is configured +The system SHALL fetch and check out project bootstrap state during recovery only when resolved configuration includes `remote.project`. + +#### Scenario: Recovery fetches project state when project URI is present +- **WHEN** the `Dml` init/bootstrap workflow recovers a missing DB and resolved config includes `remote.project` +- **THEN** it uses resolved remote and project configuration to fetch project state and check out the fetched revision locally + +#### Scenario: Recovery skips fetch and checkout when project URI is absent +- **WHEN** the `Dml` init/bootstrap workflow recovers a missing DB and resolved config has no `remote.project` +- **THEN** it creates local DB state without invoking project fetch, pull, or checkout diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/init-input-normalization/spec.md b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/init-input-normalization/spec.md new file mode 100644 index 0000000..85c07b8 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/init-input-normalization/spec.md @@ -0,0 +1,37 @@ +## REMOVED Requirements + +### Requirement: Init identity inputs are mutually exclusive +**Reason**: Init no longer accepts `name` as an alternate identity source. +**Migration**: Pass `remote_project` explicitly when project publication identity is needed. 
+ +### Requirement: Init accepts URI-only identity +**Reason**: Init now accepts `remote_project` as an optional capability input rather than as the sole way to omit `name`. +**Migration**: Continue passing `remote_project` when desired, but do not pass `name`. + +### Requirement: Init derives URI from name using resolved user +**Reason**: Name-derived project identity is removed. +**Migration**: Provide `remote_project` explicitly instead of relying on user-derived URI generation. + +### Requirement: Name-based init fails when user cannot be resolved +**Reason**: Init no longer derives project identity from user configuration. +**Migration**: Omit project identity for local-only init or pass explicit `remote_project`. + +## ADDED Requirements + +### Requirement: Init accepts optional remote capabilities +The init operation MUST accept optional `remote_project` and optional `remote_root` inputs. Init MUST allow both values to be omitted for local read-only repository bootstrap. + +#### Scenario: Init without remote configuration +- **WHEN** init is called with no `remote_project` and no `remote_root` +- **THEN** init succeeds without deriving or persisting project publication identity + +#### Scenario: Init with remote root only +- **WHEN** init is called with `remote_root` and no `remote_project` +- **THEN** init succeeds and configures remote-backed mutation and execution capability without project sync capability + +### Requirement: Init rejects project identity without remote root +The init operation MUST reject `remote_project` when `remote_root` is absent. 
+ +#### Scenario: Project URI without remote root +- **WHEN** init is called with `remote_project` and no `remote_root` +- **THEN** init fails with a descriptive validation error stating that `remote.root` is required when `remote.project` is configured diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/remote-project-refs/spec.md b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/remote-project-refs/spec.md new file mode 100644 index 0000000..23f8fd3 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/remote-project-refs/spec.md @@ -0,0 +1,16 @@ +## ADDED Requirements + +### Requirement: Project sync commands require configured local project URI +The system SHALL require configured local `remote.project` before resolving default project-addressed remote refs for push, pull, fetch, or checkout flows. + +#### Scenario: Push without configured project URI +- **WHEN** a repository has `remote.root` but no `remote.project` and push is requested +- **THEN** push fails with a descriptive error stating that `remote.project` is required for project sync + +#### Scenario: Pull without configured project URI +- **WHEN** a repository has `remote.root` but no `remote.project` and pull or fetch-by-project is requested +- **THEN** the operation fails with a descriptive error stating that `remote.project` is required for project sync + +#### Scenario: Checkout on init requires configured project URI +- **WHEN** init resolves `remote.root` but not `remote.project` +- **THEN** init does not attempt project-addressed fetch or checkout diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/required-remote-config/spec.md b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/required-remote-config/spec.md new file mode 100644 index 0000000..9b73f5f --- /dev/null +++ 
b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/required-remote-config/spec.md @@ -0,0 +1,33 @@ +## MODIFIED Requirements + +### Requirement: Remote-aware components require explicit remote configuration +The system SHALL require explicit remote configuration at the constructor or helper boundary for any runtime or ops component that performs remote-backed behavior. Remote-aware interfaces MUST receive normalized `remote.root` configuration from the shared internal configuration resolver rather than reading raw environment variables or project config files themselves. + +#### Scenario: Remote-aware ops constructor requires remote URI +- **WHEN** a remote-aware ops type is defined +- **THEN** its constructor signature requires a concrete normalized remote URI argument rather than an optional remote parameter + +#### Scenario: Remote-aware runtime helper requires remote configuration +- **WHEN** a runtime helper delegates to remote-backed behavior +- **THEN** it passes explicit remote configuration to the remote-aware component it constructs + +#### Scenario: Remote-aware component does not resolve env vars directly +- **WHEN** a remote-aware runtime or ops component is used in a remote-backed flow +- **THEN** it receives already-resolved remote configuration from its caller instead of inspecting raw remote environment variables or project config files directly + +#### Scenario: Init fails when required remote URI cannot resolve validly +- **WHEN** `DmlOps.init` requires remote-backed bootstrap behavior and shared config resolution does not produce a valid `remote.root` +- **THEN** init fails with a configuration error instead of proceeding with unresolved or implicit remote configuration + +## ADDED Requirements + +### Requirement: Project sync operations require project identity in addition to remote root +The system SHALL require configured `remote.project` for project-addressed sync behavior such as push, pull, fetch, and init-time 
project checkout. These operations MUST fail closed when `remote.root` exists but `remote.project` is absent. + +#### Scenario: Remote-backed mutation without project identity remains allowed +- **WHEN** a runtime or mutation operation requires only remote-backed storage or execution capability +- **THEN** configured `remote.root` is sufficient even when `remote.project` is absent + +#### Scenario: Project sync operation without project identity is rejected +- **WHEN** a project-addressed sync operation is requested and resolved config has no `remote.project` +- **THEN** the operation fails with a descriptive error instead of deriving project identity implicitly diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..cc3f426 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-make-remote-project-optional/specs/shared-internal-configuration/spec.md @@ -0,0 +1,20 @@ +## MODIFIED Requirements + +### Requirement: Project URI is normalized and exposes helper accessors +The system SHALL normalize and canonicalize local `remote.project` as an optional branchless project identity through shared revision URI utilities. Resolved configuration SHALL treat checkout state as repository state owned by `.dml/HEAD` rather than as a selector embedded in config. 
+ +#### Scenario: Local project URI remains branchless when configured +- **WHEN** `remote.project` is resolved for local project configuration +- **THEN** shared configuration preserves canonical branchless form `dml:///` + +#### Scenario: Local project configuration may omit project URI +- **WHEN** local project configuration omits `remote.project` +- **THEN** shared configuration resolves successfully without deriving project identity from other inputs + +#### Scenario: Tag or branch selector is not accepted for local project config +- **WHEN** local project configuration provides `remote.project` with a branch or tag selector +- **THEN** configuration resolution fails instead of translating that selector into checkout state + +#### Scenario: Project helper accessors do not expose current checkout branch +- **WHEN** resolved configuration includes `remote.project` +- **THEN** helper accessors expose project identity only and do not treat config as the source of the active branch or detached commit diff --git a/openspec/changes/archive/2026-05-17-make-remote-project-optional/tasks.md b/openspec/changes/archive/2026-05-17-make-remote-project-optional/tasks.md new file mode 100644 index 0000000..17cf12a --- /dev/null +++ b/openspec/changes/archive/2026-05-17-make-remote-project-optional/tasks.md @@ -0,0 +1,17 @@ +## 1. Init Contract + +- [x] 1.1 Remove `name`-based init inputs from Python and CLI entrypoints and accept only optional `remote_project` and optional `remote_root`. +- [x] 1.2 Enforce init validation that rejects configured `remote_project` when `remote_root` is absent. +- [x] 1.3 Update init bootstrap behavior so fetch/checkout runs only when `remote.project` is configured. + +## 2. Local Config And Capability Gates + +- [x] 2.1 Update shared config and local project-config helpers to allow missing local `remote.project` while preserving branchless validation when present. 
+- [x] 2.2 Add explicit project-sync guards so push, pull, fetch, and related flows fail with targeted errors when `remote.project` is absent. +- [x] 2.3 Preserve `remote.root` as the required capability for remote-backed mutation and execution paths. + +## 3. Tests And Docs + +- [x] 3.1 Replace name-derived init tests with coverage for local-only init, `remote_root`-only init, and `remote_project`-without-`remote_root` rejection. +- [x] 3.2 Add coverage proving project sync commands fail when `remote.project` is absent while remote-backed mutation flows remain allowed with only `remote.root`. +- [x] 3.3 Update configuration, init, and remote-sync documentation to describe the new capability split. diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/.openspec.yaml b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/.openspec.yaml new file mode 100644 index 0000000..66da1ae --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-17 diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/design.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/design.md new file mode 100644 index 0000000..11e29a7 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/design.md @@ -0,0 +1,177 @@ +## Context + +The current runtime stores launch-time resume data, lifecycle state, dependency summaries, and cancellation metadata in one mutable execution object. `IndexOps.start_fn`, cancellation code, and execution-runtime update paths all write into that object, which makes ownership hard to reason about and forces `Dml.runtime.cancel(index_id)` to synchronously drive adapter cancellation work from a runtime that may not own the adapter context or permissions. 
+ +This change refactors the runtime around two durable records with explicit owners: + +- `launch_state`: caller-owned launch/resume state protected by the cache-key lock +- `execution_record`: execution-runtime-owned lifecycle state protected by CAS + +The naming is intentional. `launch_state` describes resumable launch state only. `execution_record` describes durable control-plane lifecycle state only. The change also renames cancellation lifecycle values so they no longer imply that backend shutdown has already completed. + +## Goals / Non-Goals + +**Goals:** +- separate launch/resume ownership from lifecycle/cancellation ownership +- make caller and execution-runtime write authority explicit +- keep `active/` semantics focused on whether a cached computation has a current execution attempt +- let user-triggered `dml.runtime.cancel(index_id)` work without an active caller execution context, using `config.user` as the cancellation requester +- preserve live caller edges for invalidation/orphan checks while preserving runtime-owned spawned execution lists for cancellation traversal +- make cancellation best-effort, bounded, and explicit about what the runtime guarantees + +**Non-Goals:** +- guaranteeing that a `cancel-*` lifecycle means the backend process is already dead +- reconstructing historical runtimes solely to continue cancellation through terminal intermediates +- collapsing live caller edges and spawned execution lists into one graph structure +- removing cache-key locking from caller-owned launch-state transitions + +## Decisions + +### Decision: Split execution persistence into `launch_state` and `execution_record` + +`launch_state` will contain: + +- `execution_id` +- `cache_key` +- `resume_state` +- `created_at` + +`execution_record` will contain: + +- `execution_id` +- `cache_key` +- `lifecycle` +- `updated_at` +- `spawned_execution_ids` +- `cancellation_requested_by` + +Rationale: +- `launch_state` is a caller-owned resumption handle tied to 
`active/`. +- `execution_record` is the execution runtime's durable control-plane state. +- separating them eliminates the current mixed ownership where caller runtimes, cancellation code, and execution runtimes all mutate the same object. + +Alternatives considered: +- keep one monolithic execution object with stricter write discipline: rejected because it still couples unrelated invariants and leaves status/state confusion in place. +- keep resume state only in memory: rejected because multiprocessing and distributed runtimes need durable resume data. + +### Decision: Keep cache-key locking for `launch_state` and use CAS for `execution_record` + +The cache-key lock remains the serialization point for: + +- creating or reusing `active/` +- reading and writing `launch_state` +- removing the active pointer during orphan-triggered cancellation transitions + +`execution_record` remains independently CAS-updated with the latest ETag kept in memory by the owning runtime. If a CAS write fails due to ETag drift, the runtime rereads the record. It raises the cancellation exception only when the reread lifecycle is already a `cancel-*` value; otherwise it continues with the valid reread state. + +Rationale: +- `launch_state` and `active/` are cross-object caller-owned invariants and need lock serialization. +- `execution_record` is mostly single-owner state and benefits from lock-free CAS updates. + +Alternatives considered: +- use one lock for both objects: rejected because it would re-couple caller-owned and runtime-owned lifecycles. +- use CAS for `active/` transitions too: rejected because the active pointer and launch state need one caller-owned lock boundary anyway. 
+ +### Decision: Use `cancel-pending` and `cancel-detached` lifecycle values + +The runtime will replace `cancel-requested` and `cancelled` with: + +- `cancel-pending`: cancellation has been requested and must be observed by the execution runtime +- `cancel-detached`: the runtime completed its cancellation responsibilities and detached this execution from current ownership + +`cancel-detached` does not mean the backend process is already dead. It means: + +- the runtime removed `active/` +- future callers should create a new execution attempt instead of reusing this one +- any remaining backend shutdown is delegated to the adapter/executor contract + +The `cancel-*` prefix is intentional so cancellation-aware write paths can cheaply identify cancellation lifecycles after an ETag reread. + +Alternatives considered: +- keep `cancelled`: rejected because it implies a stronger guarantee than the runtime actually provides. +- use a non-prefixed detached term such as `detached`: rejected because the `cancel-*` prefix is useful in ETag-drift handling and is more obviously cancellation-related. + +### Decision: Treat cancellation as out-of-band control-plane work + +`dml.runtime.cancel(index_id)` is a user-triggered out-of-band workflow, not an in-band execution path. When called directly by a user, there is no active caller `execution_id`; in that case `cancellation_requested_by` is the configured user identity. + +The cancellation flow is: + +1. freeze the index so further mutation stops +2. read the root `execution_record.spawned_execution_ids` +3. remove caller-owned live caller edges for direct dependencies +4. for each callee that loses its last live caller: + - acquire the callee cache-key lock + - recheck that no live callers remain + - confirm the callee is not terminal + - remove `active/` + - CAS `execution_record.lifecycle -> cancel-pending` + - set `cancellation_requested_by` + - release the lock +5. 
issue adapter cancellation fire-and-forget for queued cancellable executions +6. CAS those executions to `cancel-detached` +7. write cancellation tombstones with if-none-match protection +8. mark the root as `cancel-detached` and raise `CancelledExecutionError` + +Rationale: +- the cancelling runtime is not guaranteed to have full adapter permissions or ownership. +- adapters are already required to process cancellation out of band and fully on their own side. +- removing the active pointer is the key signal that future callers must relaunch rather than resume. + +Alternatives considered: +- keep synchronous cancellation until the full adapter chain confirms completion: rejected because it is chunky, permission-sensitive, and couples cancellation correctness to the caller runtime. +- defer all status mutation until adapters confirm shutdown: rejected because the runtime still needs to revoke current-execution ownership immediately. + +### Decision: Keep two graph structures with different owners and meanings + +`live-callers//` remains caller-owned and represents current inbound callers. It is used for orphan detection and invalidation. + +`execution_record.spawned_execution_ids` remains runtime-owned and represents the children started by that execution for cancellation traversal. + +Rationale: +- these answer different questions and should not be conflated. +- live caller edges can shrink as callers cancel or disappear. +- spawned execution lists are historical execution summaries used for best-effort cancellation traversal. + +Alternatives considered: +- make cancellation dependencies in-memory only: rejected because multiprocessing and distributed execution need durable traversal state. +- use live caller edges as the only dependency source: rejected because edge removal intentionally discards information that cancellation traversal still needs. 
+ +### Decision: Accept best-effort cancellation limits through terminal intermediates + +The design accepts that `A -> B -> C` may leave `C` running if `B` has already gone terminal before `A` cancels and there is no practical runtime reconstruction path to continue propagation. + +Rationale: +- reconstructing historical runtime context would require additional machinery and may be more expensive than leaving the descendant alone. +- the design optimizes for bounded, ownership-correct cancellation over perfect retrospective traversal. + +Alternatives considered: +- recreate terminal intermediates solely to continue cancellation propagation: rejected for cost and complexity. +- maintain a stronger transitive cancellation graph with full replay metadata: rejected as too heavy for this refactor. + +## Risks / Trade-offs + +- [Cancellation semantics are weaker than the old name implied] -> rename the lifecycle values to `cancel-pending` and `cancel-detached`, and document that `cancel-detached` is a control-plane guarantee rather than proof of backend exit. +- [New callers may relaunch work while detached backend cleanup is still happening] -> make removal of `active/` an intentional part of the contract and require adapters to make their own cancellation side idempotent. +- [Best-effort traversal may miss descendants behind terminal intermediates] -> document this as an accepted limitation and cover it with explicit contract tests. +- [Two graph structures can drift if their roles are misunderstood] -> specify ownership and purpose separately in specs and tests. +- [CAS retries can race with non-cancellation updates] -> only treat ETag drift as terminal for cancellation when the reread lifecycle is already `cancel-*`; otherwise continue with the latest valid record. + +## Migration Plan + +1. Introduce the new record names, lifecycle names, and storage helpers behind the existing runtime flow. +2. 
Move `start_fn` launch/resume paths to `launch_state` while keeping active-pointer semantics intact. +3. Move lifecycle, dependency, and cancellation fields to `execution_record` and update executor envelopes. +4. Refactor `dml.runtime.cancel(index_id)` to the out-of-band orphan-detection and detach flow. +5. Update executor contracts and tests to return detached-style cancellation results. +6. Remove legacy monolithic execution-record assumptions once all contract tests pass. + +Rollback strategy: +- revert the refactor as one change set before archive if contract coverage reveals incompatible runtime assumptions. +- no persistent data migration rollback is required yet because the change can land atomically with its new contract tests. + +## Open Questions + +- whether the adapter envelope field should remain named `execution_status` for compatibility while carrying the new lifecycle values +- whether `cancel-detached` should be persisted by the caller-side cancellation workflow, the execution runtime after observing `cancel-pending`, or both under tightly scoped CAS rules +- whether index-root `execution_record` entries should keep using the index id directly or be renamed separately in a later cleanup diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/proposal.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/proposal.md new file mode 100644 index 0000000..8135ac9 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/proposal.md @@ -0,0 +1,30 @@ +## Why + +The current runtime cancellation model mixes launch-state ownership, lifecycle ownership, and cancellation orchestration into one mutable execution object. That makes cancellation chunky, requires the cancelling runtime to synchronously drive adapter cancellation with permissions it may not have, and blurs which runtime is allowed to mutate which execution fields. 
+ +## What Changes + +- Split the current execution object into two durable records: caller-owned `launch_state` and runtime-owned `execution_record`. +- Redefine cancellation as an out-of-band control-plane workflow that removes current-execution ownership, marks cancellation intent via CAS, and delegates final shutdown handling to adapters/executors. +- Rename lifecycle fields and statuses to make the weaker cancellation guarantee explicit, including replacing `cancelled` with a detached-state name that does not imply backend process termination. +- Preserve two distinct graph structures: caller-owned live caller edges for invalidation and orphan detection, and runtime-owned spawned execution lists for cancellation traversal. +- Update `Dml.runtime.cancel(index_id)` semantics to operate without an active caller `execution_id`, using `config.user` as the cancellation requester when invoked directly by a user. +- Document accepted best-effort cancellation limits, including the case where descendants behind already-terminal intermediates may not be cancelled. + +## Capabilities + +### New Capabilities + + +### Modified Capabilities +- `execution-state`: redefine the S3 execution coordination contract around caller-owned `launch_state`, cache-key locking, and active-pointer removal during cancellation. +- `runtime-execution-records`: replace the monolithic execution object with `launch_state` and `execution_record`, rename lifecycle fields/statuses, and update cancellation CAS semantics. +- `executor-cancellation`: align executor cancellation with out-of-band `cancel-pending` updates and detached completion semantics. +- `execution-call-edges`: clarify that live caller edges are caller-owned and distinct from runtime-owned cancellation dependencies. +- `unified-dml-surface`: update `dml.runtime.cancel(index_id)` requirements for direct user-triggered cancellation with no active execution context.
+ +## Impact + +- Affected code: `src/daggerml/_internal/ops/index.py`, `src/daggerml/_internal/exec_state.py`, `src/daggerml/_internal/dml.py`, executor implementations, and runtime contract tests. +- Affected APIs/contracts: runtime execution persistence, adapter envelope lifecycle fields, executor cancellation behavior, and `dml.runtime.cancel(index_id)` semantics. +- Affected systems: S3-backed execution coordination, invalidation lineage, adapter/executor cancellation flow, and contract/spec documentation. diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/review.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/review.md new file mode 100644 index 0000000..6e6f795 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/review.md @@ -0,0 +1,46 @@ +## Review Findings + +### 1. Cache-key lock can leak on cancellation races + +`IndexOps.start_fn()` only unlocks around `_call_adapter()` exceptions, but both `_record_call_edges()` and the subsequent `es.update_execution_record(...)` can raise `CancelledExecutionError` via `ExecutionState.update_execution_record()` when the record has already flipped to a `cancel-*` lifecycle. In that path the function exits without `es.unlock()`, leaving the lock held until TTL expiry and stalling future launches for that cache key. + +- Files: + - `src/daggerml/_internal/ops/index.py:181-199` + - `src/daggerml/_internal/exec_state.py:380-394` + +### 2. `dml.runtime.cancel()` now fails healthy long-running cancellations after three retries + +The new retry cap applies not just to transport errors, but also to normal `outcome is None` cases and lock contention. A long-running adapter-side cancel or a briefly busy lock now raises `DmlRepoError` even though cancellation is progressing correctly out of band. The agreed deviation was adapter-call retries; this loop-level retry budget changes cancellation semantics more broadly. 
+ +- File: + - `src/daggerml/_internal/dml.py:569-635` + +### 3. Cancellation still invokes adapters synchronously while holding the callee lock + +`_cancel_execution_candidate()` deletes the active pointer, updates the record, drops child edges, and calls `_invoke_cancel_update()` before releasing the callee cache-key lock. That reintroduces the original ownership and permission problem the refactor was meant to remove, and blocks new callers on the same cache key for the full adapter-call duration. + +- Files: + - `src/daggerml/_internal/ops/index.py:919-995` + - `src/daggerml/_internal/ops/index.py:1032-1053` + +### 4. `CancelledExecutionError` still inherits from `DmlRepoError` + +The design called for a cancellation interruption that is not a `daggerml.Error`. Keeping `CancelledExecutionError` as a `DmlRepoError` means it still sits inside the normal error hierarchy and risks being treated like a domain or repository failure instead of a distinct control-plane interruption. + +- Files: + - `src/daggerml/_internal/exec_state.py:35` + - `src/daggerml/_internal/types.py:455` + +### 5. Stale active-pointer recovery now crashes instead of relaunching + +In `start_fn`, the runtime clears `execution_id` after detecting that the active execution is stale because `launch_state` is missing or the lifecycle is terminal, but then immediately asserts `execution_record is None`. That assertion is false in the stale-pointer cases where the `execution_record` still exists, so a recoverable relaunch path turns into an `AssertionError`. + +- File: + - `src/daggerml/_internal/ops/index.py:144-156` + +### 6. `dml.runtime.cancel()` no longer paces normal retry loops + +The current implementation only applies backoff when `_cancel_execution_candidate()` raises an exception. 
Normal cancellation-progress cases such as `lock_retry=True` or `outcome is None` immediately spin the outer loop without the planned sleep interval, which can create a hot loop against shared state while waiting for out-of-band cancellation work to progress. + +- File: + - `src/daggerml/_internal/dml.py:581-629` diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/execution-call-edges/spec.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/execution-call-edges/spec.md new file mode 100644 index 0000000..0f9cd2a --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/execution-call-edges/spec.md @@ -0,0 +1,21 @@ +## ADDED Requirements + +### Requirement: Live caller edges SHALL be caller-owned and removable +The runtime SHALL treat `exec/edges/<callee_execution_id>/<caller_execution_id>.json` as a live caller edge owned by the caller runtime. The caller runtime that created the edge SHALL be allowed to remove that edge when it cancels or otherwise stops being a caller of the callee execution. + +#### Scenario: Caller cancellation removes its own live edge +- **WHEN** caller execution `e0` is cancelled after creating edge `exec/edges/e1/e0.json` +- **THEN** the runtime handling `e0` cancellation SHALL be allowed to remove that edge + +#### Scenario: Other callers preserve callee liveness +- **WHEN** caller `e0` removes its edge to callee `e1` +- **AND** another live edge for `e1` still exists +- **THEN** the runtime SHALL continue to treat `e1` as having live callers + +### Requirement: Live caller edges and spawned execution ids SHALL remain distinct +The runtime SHALL use live caller edges for reverse-lineage invalidation and orphan detection, and SHALL use `execution_record.spawned_execution_ids` for cancellation traversal. Removal of a live caller edge SHALL NOT remove the callee from the caller's historical spawned execution summary.
+ +#### Scenario: Removing live edge preserves historical cancellation dependency +- **WHEN** caller `e0` removes its live edge to callee `e1` during cancellation +- **THEN** `e1` MAY still remain in `e0`'s `spawned_execution_ids` +- **AND** the runtime SHALL continue treating those structures as distinct sources of truth diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/execution-state/spec.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/execution-state/spec.md new file mode 100644 index 0000000..a88624a --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/execution-state/spec.md @@ -0,0 +1,27 @@ +## ADDED Requirements + +### Requirement: Caller-owned launch state SHALL be serialized by cache-key lock +The runtime SHALL persist caller-owned `launch_state` for each execution attempt separately from lifecycle state. `launch_state` SHALL contain `execution_id`, `cache_key`, `resume_state`, and `created_at`. The runtime SHALL create and update `launch_state` only while holding the coordination lock for the corresponding `cache_key`. 
+ +#### Scenario: First running launch persists launch state under lock +- **WHEN** `start_fn` launches a new execution and receives a `running` adapter result with durable resume data +- **THEN** it SHALL persist `launch_state` containing `execution_id`, `cache_key`, `resume_state`, and `created_at` +- **AND** it SHALL do so while holding the lock for that `cache_key` + +#### Scenario: Resume reads launch state under lock +- **WHEN** `start_fn` resumes an execution referenced by `active/<cache_key>` +- **THEN** it SHALL read that execution's `launch_state` while holding the lock for that `cache_key` +- **AND** it SHALL pass `resume_state` from `launch_state` to the adapter + +### Requirement: Cancellation orphaning SHALL remove current-execution ownership under lock +When cancellation leaves an execution with no remaining live callers, the runtime SHALL acquire the coordination lock for that execution's `cache_key`, recheck that no live callers remain, ensure the execution is not terminal, and remove `active/<cache_key>` before marking cancellation intent on lifecycle state.
+ +#### Scenario: Orphaned callee loses active pointer before cancellation lifecycle update +- **WHEN** cancellation removes the last live caller edge for callee execution `e1` +- **THEN** the runtime SHALL lock the coordination key for `e1`'s `cache_key` +- **AND** it SHALL delete `active/<cache_key>` before setting the callee lifecycle to a `cancel-*` value + +#### Scenario: New caller relaunches after detached cancellation +- **WHEN** a later caller computes the same `cache_key` after the prior execution was cancellation-detached and `active/<cache_key>` is absent +- **THEN** the runtime SHALL treat the computation as having no current execution +- **AND** it SHALL create a fresh execution attempt instead of resuming the detached one diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/executor-cancellation/spec.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/executor-cancellation/spec.md new file mode 100644 index 0000000..bf4bc16 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/executor-cancellation/spec.md @@ -0,0 +1,28 @@ +## MODIFIED Requirements + +### Requirement: Executors SHALL handle `cancel-requested` as an update step +When the runtime invokes an executor with `execution_status = "cancel-pending"`, the executor SHALL treat that invocation as a cancellation update rather than as a fresh launch. Executors that normally dispatch to `runnable.sub` during update SHALL continue to dispatch to `runnable.sub` once in cancellation mode before performing executor-owned cleanup. Executors that do not normally dispatch to `runnable.sub` during update SHALL cancel their own external resources directly.
+ +#### Scenario: Update-dispatch executor forwards cancellation update +- **WHEN** an executor that normally calls `runnable.sub` on update receives `execution_status = "cancel-pending"` +- **THEN** it SHALL issue its normal update-time sub-dispatch once before executor-owned cleanup + +#### Scenario: Detached-work executor cancels backend directly +- **WHEN** an executor that does not normally call `runnable.sub` on update receives `execution_status = "cancel-pending"` +- **THEN** it SHALL cancel or tear down its own external work without invoking `runnable.sub` + +### Requirement: Successful cancel updates SHALL report `cancelled` +When an executor processes a cancel update without transport or runtime exceptions, it SHALL return `status = "cancel-detached"` even if backend cleanup or rollback continues asynchronously. The runtime cancellation workflow SHALL treat that result as confirmation that the cancel update was handled and ownership was detached rather than as a successful DAG execution result. + +#### Scenario: Cancel update reports detached success after teardown request +- **WHEN** an executor successfully processes a `cancel-pending` update +- **THEN** it SHALL return `status = "cancel-detached"` + +## ADDED Requirements + +### Requirement: Executor cancellation SHALL honor detached completion semantics +Executors SHALL interpret `cancel-detached` as a control-plane completion signal rather than proof that backend cleanup has already finished. Executors that initiate asynchronous backend rollback or shutdown SHALL still return promptly once they have issued the required cancellation work. 
+ +#### Scenario: Asynchronous backend rollback still returns detached status +- **WHEN** an executor starts backend rollback or shutdown that continues asynchronously +- **THEN** it SHALL still return `status = "cancel-detached"` after issuing that work successfully diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/runtime-execution-records/spec.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/runtime-execution-records/spec.md new file mode 100644 index 0000000..eb4a8b4 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/runtime-execution-records/spec.md @@ -0,0 +1,92 @@ +## MODIFIED Requirements + +### Requirement: Runtime SHALL maintain one mutable execution object per execution id +The runtime SHALL persist one mutable lifecycle object per execution id as `execution_record`, separate from caller-owned `launch_state`. `execution_record` SHALL include `execution_id`, `cache_key`, `lifecycle`, `updated_at`, `spawned_execution_ids`, and `cancellation_requested_by`, where `cancellation_requested_by` is `str | null`. `lifecycle` SHALL be one of `running`, `cancel-pending`, `cancel-detached`, `succeeded`, or `failed`. `spawned_execution_ids` SHALL be the deduped set of child execution ids started by that execution for cancellation traversal. `execution_record` updates SHALL use compare-and-swap with the latest known ETag. If a compare-and-swap update observes ETag drift, the runtime SHALL reread the record and SHALL raise cancellation interruption only when the reread lifecycle is already a `cancel-*` value; otherwise it SHALL continue from the latest valid reread state. + +The same `execution_record` schema SHALL also be used for each live index id. 
For index-root records, the object path SHALL be `exec/state/<index_id>.json`, `execution_id` SHALL equal the `index_id`, `cache_key` SHALL equal the `index_id`, and `spawned_execution_ids` SHALL track the deduped set of execution ids started from that index. + +The `execution_record` schema SHALL be: + +- `execution_id: str` +- `cache_key: str` +- `lifecycle: "running" | "cancel-pending" | "cancel-detached" | "succeeded" | "failed"` +- `updated_at: int` +- `spawned_execution_ids: list[str]` +- `cancellation_requested_by: str | null` + +#### Scenario: Index creation creates the initial execution record +- **WHEN** `IndexOps.create` initializes a new runtime root +- **THEN** it SHALL create an `execution_record` for that root before execution starts +- **AND** that record SHALL use `execution_id = index_id` and `cache_key = index_id` + +#### Scenario: Lifecycle record does not store resume state +- **WHEN** the runtime persists `execution_record` for execution `e0` +- **THEN** it SHALL NOT store adapter resume state in that object +- **AND** resume state SHALL instead live only in caller-owned `launch_state` + +#### Scenario: CAS reread continues on non-cancellation drift +- **WHEN** a compare-and-swap update for `execution_record` observes an ETag conflict +- **AND** the reread lifecycle is `running`, `succeeded`, or `failed` +- **THEN** the runtime SHALL continue from the reread record instead of raising cancellation interruption + +#### Scenario: CAS reread raises on cancellation lifecycle drift +- **WHEN** a compare-and-swap update for `execution_record` observes an ETag conflict +- **AND** the reread lifecycle is `cancel-pending` or `cancel-detached` +- **THEN** the runtime SHALL surface cancellation interruption rather than continuing normal execution updates + +#### Scenario: Root record accumulates spawned execution ids +- **WHEN** index `idx1` starts execution `e1` +- **THEN** the runtime SHALL update `exec/state/idx1.json` so that `spawned_execution_ids` contains
`e1` + +### Requirement: Adapter envelope and result schema SHALL follow the runtime-owned execution contract +The adapter envelope SHALL include `argv_ptr`, `cache_key`, `execution_id`, `remote`, `runnable`, `state`, `execution_status`, and `cancel_requested_by`. The adapter result SHALL use only `running`, `succeeded`, `failed`, or `cancel-detached` statuses. `running` MUST include durable `state`. `succeeded` MUST include `dag_id`. `failed` MUST include `error`. `cancel-detached` MUST identify a successful cancellation update that detached runtime ownership and MAY omit durable execution output. + +#### Scenario: First adapter call uses null state +- **WHEN** the runtime invokes an adapter for a new execution +- **THEN** the adapter envelope SHALL include `state = null` + +#### Scenario: Cancel update includes renamed cancellation lifecycle +- **WHEN** the runtime invokes an adapter for a cancel update +- **THEN** the adapter envelope SHALL include `execution_status = "cancel-pending"` +- **AND** it SHALL include `cancel_requested_by` + +#### Scenario: Cancel update may return detached status +- **WHEN** an executor completes a cancel update successfully +- **THEN** the adapter result MAY use `status = "cancel-detached"` + +#### Scenario: Pending is rejected +- **WHEN** an adapter returns `pending` +- **THEN** the runtime SHALL reject that result as invalid adapter output + +## ADDED Requirements + +### Requirement: Runtime SHALL separate caller-owned launch state from runtime-owned lifecycle state +The runtime SHALL treat `launch_state` as caller-owned state for launch and resume, and `execution_record` as execution-runtime-owned state for lifecycle, spawned execution summaries, and cancellation metadata. The caller runtime MAY transition a callee `execution_record` only to `cancel-pending` or `cancel-detached` during orphan-triggered cancellation, and SHALL NOT otherwise mutate lifecycle state owned by the callee execution runtime. 
+ +#### Scenario: Caller runtime owns launch state updates +- **WHEN** `start_fn` launches or resumes execution `e1` +- **THEN** the caller runtime SHALL be the only path that creates or updates `launch_state` for `e1` + +#### Scenario: Execution runtime owns terminal lifecycle publication +- **WHEN** execution `e1` reaches `succeeded` or `failed` +- **THEN** the execution runtime for `e1` SHALL publish that terminal lifecycle in `execution_record` +- **AND** caller runtimes SHALL NOT publish those terminal lifecycle values for `e1` + +### Requirement: Cancellation-detached lifecycle SHALL describe runtime detachment, not backend completion +`cancel-detached` SHALL mean that the runtime completed its cancellation responsibilities for that execution, removed current-execution ownership by clearing `active/<cache_key>`, and delegated any remaining backend shutdown handling to the adapter or executor contract. `cancel-detached` SHALL NOT mean that external cleanup has fully completed or that the rooted execution graph is fully cancelled. + +#### Scenario: Detached lifecycle permits fresh relaunch +- **WHEN** execution `e1` is marked `cancel-detached` +- **THEN** the runtime SHALL allow a future caller for the same `cache_key` to create a new execution attempt + +#### Scenario: Detached lifecycle does not prove backend exit +- **WHEN** execution `e1` is marked `cancel-detached` +- **THEN** callers SHALL NOT infer that all external resources for `e1` have already terminated + +### Requirement: Best-effort cancellation traversal MAY stop at terminal intermediates +The runtime SHALL perform cancellation traversal from `spawned_execution_ids` on a best-effort basis. If a descendant execution is reachable only through an already-terminal intermediate runtime that is not reconstructed, the runtime MAY leave that descendant running.
+ +#### Scenario: Terminal intermediate prevents deeper cancellation traversal +- **WHEN** execution `A` spawned `B`, `B` spawned `C`, and `B` is already terminal before `A` is cancelled +- **THEN** the runtime MAY cancel `A` without cancelling `C` +- **AND** that outcome SHALL be treated as an accepted limitation of best-effort cancellation diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/unified-dml-surface/spec.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..ac297f1 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/specs/unified-dml-surface/spec.md @@ -0,0 +1,21 @@ +## ADDED Requirements + +### Requirement: Direct user cancellation SHALL use configured user identity +When `dml.runtime.cancel(index_id)` is invoked without an active runtime execution context, the workflow SHALL still proceed as an out-of-band cancellation operation. In that case, the runtime SHALL record `cancellation_requested_by` from the configured user identity. + +#### Scenario: User-triggered cancel records configured user without active execution +- **WHEN** a user directly invokes `dml.runtime.cancel("idx1")` +- **AND** there is no active caller `execution_id` +- **THEN** the runtime SHALL set `cancellation_requested_by` to `config.user` + +#### Scenario: Missing configured user still fails cancel +- **WHEN** a user invokes `dml.runtime.cancel("idx1")` +- **AND** no configured user identity is available +- **THEN** the runtime SHALL fail the request rather than persisting an empty cancellation requester + +### Requirement: Runtime cancellation SHALL be out-of-band control-plane behavior +`dml.runtime.cancel(index_id)` SHALL operate as an out-of-band control-plane workflow rather than as a continuation of a running execution. 
The workflow SHALL freeze the target index, remove caller-owned live edges, orphan eligible callees, and request detached cancellation without requiring an active caller execution context. + +#### Scenario: Direct cancel freezes index before cancellation traversal +- **WHEN** a user invokes `dml.runtime.cancel("idx1")` +- **THEN** the runtime SHALL freeze the index before removing live caller edges or requesting callee cancellation diff --git a/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/tasks.md b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/tasks.md new file mode 100644 index 0000000..53fee0c --- /dev/null +++ b/openspec/changes/archive/2026-05-17-refactor-runtime-cancellation-ownership/tasks.md @@ -0,0 +1,21 @@ +## 1. Persistence Model Refactor + +- [x] 1.1 Split the current execution persistence helpers into caller-owned `launch_state` and runtime-owned `execution_record` read/write paths. +- [x] 1.2 Update cache-key lock and `active/<cache_key>` handling so launch/resume flows use `launch_state` and orphan-triggered cancellation clears the active pointer before lifecycle cancellation updates. +- [x] 1.3 Rename lifecycle fields and values across runtime types and validation logic to use `lifecycle`, `spawned_execution_ids`, `cancellation_requested_by`, `cancel-pending`, and `cancel-detached`. + +## 2. Runtime and Cancellation Flow + +- [x] 2.1 Refactor `IndexOps.start_fn` so caller-owned launch/resume state and runtime-owned lifecycle state are updated through their new ownership boundaries. +- [x] 2.2 Refactor cancellation planning and execution so live caller edges are caller-owned, orphan detection uses those live edges, and detached cancellation uses CAS on `execution_record` plus cancellation tombstones.
+- [x] 2.3 Update `dml.runtime.cancel(index_id)` to run as an out-of-band workflow that records `config.user` when no active caller execution context exists and raises `CancelledExecutionError` on cancellation interruption. + +## 3. Executors and Adapter Contract + +- [x] 3.1 Update adapter envelope/result validation and executor dispatch to use the renamed cancellation lifecycle values while preserving the envelope field names required by the contract. +- [x] 3.2 Update built-in executors to treat `cancel-pending` as the cancellation update signal and return `cancel-detached` after successful fire-and-forget cancellation handling. + +## 4. Contract Coverage and Documentation + +- [x] 4.1 Update runtime, index-ops, and executor contract tests for the split persistence model, ETag drift behavior, direct user cancellation requester behavior, and detached cancellation semantics. +- [x] 4.2 Update the runtime and executor documentation to reflect `launch_state`, `execution_record`, live caller edge ownership, spawned execution summaries, and the accepted best-effort cancellation limitation through terminal intermediates. 
diff --git a/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/.openspec.yaml b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/.openspec.yaml new file mode 100644 index 0000000..66da1ae --- /dev/null +++ b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-17 diff --git a/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/design.md b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/design.md new file mode 100644 index 0000000..9ddd078 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/design.md @@ -0,0 +1,95 @@ +## Context + +`daggerml._internal.dml` currently orchestrates lower-level ops through two internal abstractions: the `DmlOps` facade in `daggerml._internal.ops` and the `_OpsProxy` string-dispatch layer in `daggerml._internal.dml`. The surviving `Dml` surface is already the caller-facing orchestration boundary, so these extra layers mostly wrap DB open/close, concrete ops construction, and remote configuration expansion. + +The change must preserve the existing `Dml` and namespace API exactly. The simplification is intentionally internal-only: remove indirection, reduce code size, and make module-level helper functions in `daggerml._internal.dml` construct the owning ops classes directly. + +## Goals / Non-Goals + +**Goals:** + +- Remove `DmlOps` as an internal repository/session facade. +- Remove `_OpsProxy`, string-based factory dispatch, and helper layers that exist only to reach concrete ops classes. +- Keep `Dml` as the sole caller-facing orchestration boundary without adding any new public methods, properties, or namespaces. +- Preserve explicit `remote.root` threading for remote-aware helpers and ops classes. +- Update specs, tests, and docs so they describe the simplified construction path rather than the removed facade. 
+ +**Non-Goals:** + +- No new `Dml` APIs or namespace reshaping. +- No compatibility shims, alias exports, or transitional wrappers for `DmlOps`. +- No change to commit, DAG, runtime, cache, GC, or remote business semantics beyond how their ops instances are constructed. +- No redesign of the lower-level ops class public methods. + +## Decisions + +### Decision: `Dml` helper functions will construct concrete ops classes directly + +`daggerml._internal.dml` will own the DB lifecycle helpers and the module-level functions that instantiate `HeadOps`, `CommitOps`, `DagOps`, `NodeOps`, `IndexOps`, `CacheOps`, `GcOps`, and `RemoteOps` directly. + +Rationale: + +- This removes both the facade layer and the string-dispatch layer. +- The resulting code matches the actual subsystem ownership documented in the repo: `Dml` orchestrates, concrete ops implement behavior. +- It keeps internal construction readable at the call site instead of hiding it behind factory names. + +Alternatives considered: + +- Remove `DmlOps` but keep `_OpsProxy`: rejected because it preserves string dispatch and does not meaningfully simplify the orchestration path. +- Replace `DmlOps` with another lightweight facade: rejected because it renames the same abstraction cost instead of deleting it. + +### Decision: The `Dml` public and namespaced surface remains frozen + +This change will not add methods or properties to `Dml` or any of its namespaces. All simplification happens inside existing module-level helpers and namespace implementations. + +Rationale: + +- The goal is simplification, not surface expansion. +- Existing callers already have the orchestration boundary they need. +- Preserving the current surface keeps the change narrowly focused on internal construction. + +Alternatives considered: + +- Add new convenience methods for direct ops access: rejected as counter to the stated scope and unnecessary once helper construction is simplified. 
+ +### Decision: Remote-aware construction continues to take explicit resolved `remote.root` + +Helpers that instantiate `IndexOps`, `CacheOps`, `RemoteOps`, and other remote-aware components will continue to pass normalized `remote.root` explicitly. The removal of `DmlOps` will not reintroduce implicit config lookups at lower layers. + +Rationale: + +- It preserves the existing explicit-configuration contract. +- It keeps remote-aware behavior consistent with the existing specs. +- It avoids sliding back into environment-driven construction hidden inside lower-level components. + +Alternatives considered: + +- Let lower-level ops resolve config themselves: rejected because it weakens the current boundary and increases hidden coupling. + +### Decision: Spec and documentation language will stop naming `DmlOps` as an active boundary + +Any specs or docs that currently describe `DmlOps` as the surviving internal orchestration boundary will be rewritten to point at the shared `Dml` workflow and direct helper-based ops construction. + +Rationale: + +- The artifacts should describe the architecture that remains after the simplification. +- Keeping `DmlOps` in the docs after deleting it would preserve conceptual dead code. + +## Risks / Trade-offs + +- Removing `DmlOps` also removes a single place that bundled DB lifecycle and ops factories. → Keep DB open/create helpers explicit in `daggerml._internal.dml` so construction remains centralized without reintroducing a facade. +- Direct construction can expose duplicated remote parsing or config-expansion logic that the facade previously hid. → Consolidate the construction path around shared helper functions in `daggerml._internal.dml` and update tests around remote-aware helper behavior. +- Existing specs and tests may still import or name `DmlOps`. → Update OpenSpec deltas, docs, and contract tests in the same change so the architecture and verification story stay aligned. + +## Migration Plan + +1. 
Update OpenSpec artifacts to define `Dml` as the surviving orchestration boundary and remove `DmlOps` language. +2. Delete `DmlOps` and `_OpsProxy`-style helper layers. +3. Rewrite `daggerml._internal.dml` helper construction around direct concrete ops instantiation and explicit DB lifecycle helpers. +4. Update tests and docs to target the direct-construction model. + +Rollback strategy: revert the change before release. No compatibility layer or persistent data migration is planned. + +## Open Questions + +None. diff --git a/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/proposal.md b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/proposal.md new file mode 100644 index 0000000..05af1e5 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/proposal.md @@ -0,0 +1,30 @@ +## Why + +`daggerml._internal.dml` currently reaches the concrete ops classes through two layers of indirection: the `DmlOps` facade and the `_OpsProxy` string-dispatch helper. Those abstractions no longer carry their own value and now add code size, duplicate remote-construction logic, and obscure the actual orchestration boundary. + +## What Changes + +- Remove the internal `DmlOps` facade and stop treating it as the repository/session boundary. +- Remove `_OpsProxy`, `call_ops_method`, and other string-based dispatch helpers from `daggerml._internal.dml`. +- Have the module-level helper functions in `daggerml._internal.dml` open the DB and instantiate the concrete ops classes directly. +- Preserve the existing public `Dml` and namespace surface exactly as-is; this change does not add new `Dml` methods, properties, or namespaces. +- **BREAKING** Remove internal backward-compatibility import paths and docs that describe `DmlOps` as a supported internal facade. + +## Capabilities + +### New Capabilities + +None. 
+ +### Modified Capabilities + +- `unified-dml-surface`: tighten the shared `Dml` orchestration contract so helper logic constructs concrete ops classes directly without adding new caller-facing surface. +- `shared-internal-configuration`: update bootstrap/config-resolution requirements to refer to the shared `Dml` workflow and module-level helper construction rather than `DmlOps`. +- `required-remote-config`: preserve explicit `remote.root` threading while removing the `DmlOps` helper boundary. +- `thin-cli-routing`: replace stale `DmlOps` wording so CLI routing requirements point at the surviving shared orchestration boundary. + +## Impact + +- Affected code: `src/daggerml/_internal/dml.py`, `src/daggerml/_internal/ops/__init__.py`, and tests/docs that import or describe `DmlOps`. +- Affected contracts: OpenSpec capabilities listed above and docs under `docs/internal/ops/` and `docs/default-dml-runtime.md`. +- APIs: no caller-facing `Dml` API expansion; internal-only breaking removal of `DmlOps` and proxy-based ops construction. diff --git a/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/required-remote-config/spec.md b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/required-remote-config/spec.md new file mode 100644 index 0000000..9245dbb --- /dev/null +++ b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/required-remote-config/spec.md @@ -0,0 +1,20 @@ +## MODIFIED Requirements + +### Requirement: Remote-aware components require explicit remote configuration +The system SHALL require explicit remote configuration at the constructor or helper boundary for any runtime or ops component that performs remote-backed behavior. 
Remote-aware interfaces MUST NOT model remote configuration as optional, MUST NOT provide `None` defaults for required remote parameters, and MUST receive normalized `remote.root` configuration from the shared internal configuration resolver rather than reading raw environment variables or project config files themselves. + +#### Scenario: Remote-aware ops constructor requires remote URI +- **WHEN** a remote-aware ops type is defined +- **THEN** its constructor signature requires a concrete normalized remote URI argument rather than an optional remote parameter + +#### Scenario: Remote-aware runtime helper requires remote configuration +- **WHEN** a runtime helper delegates to remote-backed behavior +- **THEN** it passes explicit remote configuration to the remote-aware component it constructs + +#### Scenario: Remote-aware component does not resolve env vars directly +- **WHEN** a remote-aware runtime or ops component is used in a remote-backed flow +- **THEN** it receives already-resolved remote configuration from its caller instead of inspecting `DML_REMOTE`, older remote env-var forms, or project config files directly + +#### Scenario: Init fails when required remote URI cannot resolve validly +- **WHEN** the shared `Dml` init/bootstrap workflow requires remote-backed bootstrap behavior and shared config resolution does not produce a valid `remote.root` +- **THEN** init fails with a configuration error instead of proceeding with unresolved or implicit remote configuration diff --git a/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..1682668 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/shared-internal-configuration/spec.md @@ -0,0 +1,20 @@ +## MODIFIED Requirements + +### Requirement: Multiple config sources 
normalize into the shared internal model +The system SHALL treat explicit arguments, environment variables, project-local config, and global config as sources that feed the shared internal configuration model. Source-specific loading may differ, but normalization and precedence MUST be centralized in the shared internal resolver. + +#### Scenario: Project-local and global config feed shared resolution +- **WHEN** a frontend resolves configuration for an operation in a project directory +- **THEN** project-local `.dml/config.toml` and any applicable global config inputs are loaded as sources for the same shared internal resolution path + +#### Scenario: Environment values are normalized centrally +- **WHEN** configuration is resolved from environment variables +- **THEN** the shared internal resolver, not the frontend, maps those values into the canonical internal configuration model + +#### Scenario: Init project layout creation delegates to shared internal helper +- **WHEN** the shared `Dml` init/bootstrap workflow must create missing project layout artifacts for a local project +- **THEN** it delegates filesystem bootstrap work to shared internal project-layout helper logic instead of duplicating directory and config-file writes across orchestration helpers + +#### Scenario: Init resolves explicit options through shared resolver +- **WHEN** a caller provides init-time options for project/runtime configuration +- **THEN** the shared `Dml` init/bootstrap workflow resolves them through the shared internal resolver before mutating project state diff --git a/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/thin-cli-routing/spec.md b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/thin-cli-routing/spec.md new file mode 100644 index 0000000..623b889 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/thin-cli-routing/spec.md @@ -0,0 +1,23 @@ +## MODIFIED Requirements + +### 
Requirement: CLI project commands delegate to a single Dml workflow method +The `dml` CLI project command handlers SHALL remain thin adapters that parse command arguments and invoke exactly one workflow entrypoint per command path. + +#### Scenario: Fetch delegates through Dml +- **WHEN** a user runs `dml fetch [branch]` +- **THEN** the CLI handler parses inputs and calls one shared `Dml` fetch workflow method that performs remote synchronization behavior + +#### Scenario: Checkout delegates through Dml +- **WHEN** a user runs `dml checkout <revision>` +- **THEN** the CLI handler parses the revision and calls one shared `Dml` checkout workflow method that returns attached/detached result details + +#### Scenario: Merge delegates through Dml +- **WHEN** a user runs `dml merge --head <head> --user <user>` +- **THEN** the CLI handler calls one shared `Dml` merge workflow method and does not instantiate commit or remote ops directly + +### Requirement: CLI does not own git-like project business logic +The `_cli` layer SHALL NOT contain git-like project orchestration logic that coordinates repository state, commit resolution, or remote protocol execution.
+ +#### Scenario: Project logic relocation +- **WHEN** git-like project command behavior requires cross-subsystem coordination +- **THEN** the implementation resides in the shared `Dml` workflow layer and the internal ops it invokes, while CLI code remains argument parsing and result forwarding only diff --git a/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/unified-dml-surface/spec.md b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..ae98fc4 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/specs/unified-dml-surface/spec.md @@ -0,0 +1,21 @@ +## MODIFIED Requirements + +### Requirement: `Dml` delegates repository behavior to the relevant ops classes +The shared `Dml` class SHALL orchestrate workflows by delegating repository actions to the relevant subsystem ops classes rather than re-implementing those mechanics inline. Module-level helper functions in `daggerml._internal.dml` SHALL construct the owning concrete ops classes directly and SHALL NOT route calls through a facade object or string-dispatch proxy layer. 
+ +#### Scenario: Commit-oriented workflow delegates to CommitOps +- **WHEN** a caller invokes `dml.show`, `dml.log`, `dml.diff`, `dml.merge`, or `dml.revert` +- **THEN** `Dml` delegates the relevant repository operations to `CommitOps` after preparing resolved inputs + +#### Scenario: Runtime workflow delegates to IndexOps +- **WHEN** a caller invokes `dml.runtime.create`, `dml.runtime.put_literal`, `dml.runtime.start_fn`, or `dml.runtime.commit` +- **THEN** `Dml` delegates the relevant repository operations to `IndexOps` after preparing resolved inputs + +#### Scenario: Admin workflow delegates to the owning subsystem +- **WHEN** a caller invokes an admin cache, remote, or gc workflow +- **THEN** `Dml` delegates the repository action to `CacheOps`, `RemoteOps`, or `GcOps` respectively after preparing resolved inputs + +#### Scenario: Helper construction instantiates concrete ops directly +- **WHEN** a shared `Dml` workflow needs an ops object such as `CommitOps`, `HeadOps`, `IndexOps`, or `RemoteOps` +- **THEN** the helper logic in `daggerml._internal.dml` constructs that concrete ops class directly against the active DB handle +- **AND** it does not dispatch through a `DmlOps` facade or `_OpsProxy`-style string factory diff --git a/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/tasks.md b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/tasks.md new file mode 100644 index 0000000..c34f933 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-simplify-dml-ops-construction/tasks.md @@ -0,0 +1,20 @@ +## 1. Remove indirection layers + +- [x] 1.1 Delete the `DmlOps` facade from `src/daggerml/_internal/ops/__init__.py` and remove code that imports or constructs it. +- [x] 1.2 Delete `_OpsProxy`, `call_ops_method`, and related string-dispatch helpers from `src/daggerml/_internal/dml.py`. + +## 2. 
Rebuild direct ops construction + +- [x] 2.1 Rewrite the module-level ops helper functions in `src/daggerml/_internal/dml.py` so they open the DB and instantiate the owning concrete ops classes directly. +- [x] 2.2 Preserve existing remote-aware behavior by threading resolved `remote.root` and fetch-worker configuration through the direct helper construction path. +- [x] 2.3 Keep repository bootstrap behavior intact by replacing `DmlOps.create(...)` usage with direct DB/bootstrap orchestration in `daggerml._internal.dml`. + +## 3. Realign tests and docs + +- [x] 3.1 Update contract tests that import or describe `DmlOps` so they validate the direct helper-based construction path instead. +- [x] 3.2 Update docs and OpenSpec-linked prose that still describe `DmlOps` as an active internal facade or default-runtime boundary. + +## 4. Verify simplified boundary + +- [x] 4.1 Run the targeted contract and unit tests covering `daggerml._internal.dml`, bootstrap, and remote-aware helper construction. +- [x] 4.2 Confirm the surviving `Dml` and namespace surface is unchanged and no backward-compatibility shims remain. diff --git a/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/.openspec.yaml b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/.openspec.yaml new file mode 100644 index 0000000..231e3ab --- /dev/null +++ b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-18 diff --git a/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/design.md b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/design.md new file mode 100644 index 0000000..bb58435 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/design.md @@ -0,0 +1,88 @@ +## Context + +The current system already separates authoring-time adapter selection from executor dispatch, but it does not enforce that separation at the runtime boundary. 
Public APIs commonly accept symbolic adapter names such as `local`, codec normalization resolves those names through the adapter registry, and contrib executors typically emit concrete adapter strings such as `dml-local-adapter` or `dml-lambda-adapter`. However, `IndexOps` still contains a fallback path that treats an unresolved concrete adapter string as a hint to re-enter the adapter registry, import a Python object, and invoke its `cli()` entrypoint indirectly. + +That fallback hides the real contract. The runtime should only see concrete command-line-callable adapter identities such as `dml-local-adapter`, `dml-lambda-adapter`, `python3`, `podman-adapter`, or `/opt/acme/bin/build-adapter`. Symbolic names such as `local`, `lambda`, or plugin-defined sugar such as `gpu` belong to authoring and normalization, not to runtime execution. The only intentional non-command adapter value is `adapter == ""` for explicit builtin-function execution paths such as `get` and `concat`, where the runtime checks for builtin behavior directly instead of shelling out. + +## Goals / Non-Goals + +**Goals:** +- Make the runtime adapter boundary explicit and fail closed when a concrete adapter command is unavailable. +- Preserve short symbolic adapter names as authoring-time sugar so users do not need to spell built-in adapter commands in normal API calls. +- Keep plugin extensibility intact for both adapter sugar and executor registration under existing built-in adapters. +- Document that concrete adapter identities need not start with `dml-`; built-in adapters happen to use that prefix, but plugin adapters may resolve to any command-line-callable string or explicit path. + +**Non-Goals:** +- Introduce a new non-CLI adapter invocation mode. +- Require plugin adapters to use a `dml-` prefix. +- Change builtin execution to route through adapters when the runtime already handles builtin functions directly. 
+- Redesign executor dispatch semantics beyond clarifying the adapter-resolution boundary. + +## Decisions + +### 1. Runtime execution only accepts concrete adapter commands + +The runtime boundary will treat `Runnable.adapter` as an operational command string, not as a symbolic registry key. By the time a runnable reaches `IndexOps` adapter execution, its adapter value must already be directly callable from the command line or be an explicit filesystem path. + +Rationale: +- This makes runtime behavior deterministic across environments. +- It prevents a single concrete adapter string from taking different execution paths depending on whether a Python import fallback happens to succeed. +- It matches the intended transport boundary: adapters are CLI programs that own their own stdin/stdout contract. + +Alternatives considered: +- Keep the Python import fallback: rejected because it silently repairs invalid runtime state and makes execution mechanism depend on environment accidents. +- Let `IndexOps` resolve symbolic adapter names at runtime: rejected because it leaks authoring sugar into the execution boundary. + +### 2. Symbolic adapter names remain sugar resolved upstream + +Author-facing APIs may continue to accept sugar such as `local`, `lambda`, or plugin-defined symbolic names. That sugar must resolve before runtime execution, through the adapter registry and normal runnable-resolution flow. + +Examples: +- `local` -> `dml-local-adapter` +- `lambda` -> `dml-lambda-adapter` +- `gpu` -> `podman-adapter` +- `acme` -> `/opt/acme/bin/acme-adapter` + +Rationale: +- This preserves ergonomics while keeping runtime semantics strict. +- It provides a clear extension point for plugin-defined sugar without changing the execution model. + +Alternatives considered: +- Require users to always provide full adapter commands: rejected because it makes normal built-in usage unnecessarily verbose. + +### 3. 
Builtin execution keeps its explicit empty-adapter exception + +The only adapter value that may reach runtime without being command-line-callable is `""`, and only for explicit builtin-function execution paths where the runtime checks for builtin behavior directly and does not shell out. + +Rationale: +- Builtins are not external adapters and already follow a separate runtime branch. +- Keeping this exception explicit prevents future confusion about whether empty adapter strings are generally allowed. + +Alternatives considered: +- Represent builtins with a special command name: rejected because the runtime already has explicit builtin handling and does not need a fake adapter executable. + +### 4. Missing concrete adapter commands are installation or configuration errors + +If a resolved runnable names `dml-local-adapter`, `python3`, `podman-adapter`, or any other concrete command and that command is not callable from the runtime environment, execution must fail immediately rather than falling back to import-based recovery. + +Rationale: +- This exposes packaging and environment problems where they actually exist. +- It keeps tests honest: built-in adapter scripts declared in `pyproject.toml` must be installed and available. + +Alternatives considered: +- Allow fallback only for built-in adapters: rejected because it still weakens the invariant and creates a privileged special case for one packaging mode. + +### 5. Adapter test fixtures must obey the same executability contract + +Test fixtures that are referenced as adapters at runtime must themselves be executable command-line programs or explicit executable paths. For example, `tests/assets/internal_fn/python-fork-adapter.py` must carry an executable bit when tests pass its filesystem path as `Runnable.adapter`. + +Rationale: +- This keeps tests aligned with the production runtime contract instead of relying on test-only behavior. 
+- It ensures path-based adapter coverage exercises the same `shutil.which(...)` or explicit-path execution path used in production. + +## Risks / Trade-offs + +- Missing adapter scripts in local dev or test environments will fail sooner -> Mitigation: treat this as desired signal and update tests/tooling to require installed console scripts. +- Some tests and helpers may still construct raw runnables with symbolic adapter names -> Mitigation: update those fixtures to reflect the runtime invariant and keep sugar only at authoring-time APIs. +- Plugin authors may assume importable adapter specs are enough -> Mitigation: document that plugin sugar must resolve to a concrete CLI adapter command or explicit executable path. +- The builtin empty-string exception could be overgeneralized later -> Mitigation: keep the exception narrow and document that it only applies to explicit builtin runtime branches. diff --git a/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/proposal.md b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/proposal.md new file mode 100644 index 0000000..e54d967 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/proposal.md @@ -0,0 +1,29 @@ +## Why + +Adapter execution currently tolerates a runtime fallback that imports Python adapter objects when a concrete adapter command is missing from `PATH`. That blurs the boundary between adapter-name sugar and concrete runtime execution, and it allows invalid resolved runnables to keep working instead of failing as installation or configuration errors. + +## What Changes + +- Define a strict runtime invariant: any runnable that reaches adapter execution MUST carry a command-line-callable adapter string or an explicit filesystem path. +- Preserve symbolic adapter names such as `local` and `lambda` only as authoring-time sugar that is resolved before runtime execution. 
+- Treat built-in adapters `dml-local-adapter` and `dml-lambda-adapter` as canonical concrete adapter identities. +- Support plugin-defined sugar that resolves to any concrete callable adapter string, including names that do not start with `dml-` and explicit executable paths. +- Reserve `adapter == ""` only for explicit builtin-function execution paths that never shell out to an adapter. +- **BREAKING** Remove runtime fallback behavior that imports adapter specs and invokes `cli()` when the concrete adapter command is not present on `PATH`. + +## Capabilities + +### New Capabilities + +- `adapter-cli-resolution`: defines how symbolic adapter names resolve to concrete command-line adapter identities and how runtime execution handles builtin exceptions and missing commands. + +### Modified Capabilities + +None. + +## Impact + +- Affected code: delayed runnable normalization, adapter registry usage, `IndexOps` adapter invocation, contrib executor runnable construction, and adapter-path test helpers. +- Affected runtime behavior: missing concrete adapter commands become hard failures instead of falling back to Python import-based execution. +- Affected plugin contracts: plugin adapters may continue to provide sugar, but they must resolve to a concrete command-line-callable adapter identity before runtime execution. +- Affected test fixtures: adapter-backed test helpers such as `tests/assets/internal_fn/python-fork-adapter.py` must be executable from the command line when referenced as runtime adapters, because runtime execution no longer repairs non-executable adapter references via Python import fallback. 
diff --git a/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/specs/adapter-cli-resolution/spec.md b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/specs/adapter-cli-resolution/spec.md new file mode 100644 index 0000000..5b953b2 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/specs/adapter-cli-resolution/spec.md @@ -0,0 +1,55 @@ +## ADDED Requirements + +### Requirement: Runtime adapter execution SHALL use only concrete command-line adapter identities +Any runnable that reaches adapter execution SHALL carry an adapter value that is directly command-line-callable from the runtime environment or is an explicit executable path. Runtime adapter execution SHALL NOT reinterpret that value as symbolic sugar, SHALL NOT consult the adapter registry to repair it, and SHALL NOT fall back to Python import-based `cli()` invocation. + +#### Scenario: Built-in local adapter executes as a concrete command +- **WHEN** runtime execution receives a runnable with `adapter = "dml-local-adapter"` +- **THEN** it invokes `dml-local-adapter` as a command-line program +- **AND** it does not re-resolve `dml-local-adapter` through the adapter registry + +#### Scenario: Plugin adapter executes without requiring a `dml-` prefix +- **WHEN** runtime execution receives a runnable with `adapter = "podman-adapter"` +- **THEN** it treats `podman-adapter` as a valid concrete adapter command if callable from the runtime environment +- **AND** it does not require the adapter string to start with `dml-` + +#### Scenario: Explicit executable path is accepted +- **WHEN** runtime execution receives a runnable with `adapter = "/opt/acme/bin/build-adapter"` +- **THEN** it invokes that path directly as the adapter command + +#### Scenario: Test adapter path must itself be executable +- **WHEN** a test or fixture passes a filesystem path such as `tests/assets/internal_fn/python-fork-adapter.py` as `runnable.adapter` +- **THEN** that file is 
expected to be directly executable by the runtime, including any required executable permission bits +- **AND** runtime execution does not repair a non-executable adapter path through Python import fallback + +#### Scenario: Missing concrete adapter command fails closed +- **WHEN** runtime execution receives a runnable with a concrete adapter command that is not callable from the runtime environment +- **THEN** execution fails with an adapter-not-found error +- **AND** the runtime does not attempt Python import-based recovery + +### Requirement: Symbolic adapter names SHALL resolve before runtime execution +Author-facing APIs MAY accept symbolic adapter names as sugar, but the adapter registry and runnable-resolution flow SHALL resolve that sugar to a concrete command-line adapter identity before runtime execution begins. + +#### Scenario: Built-in sugar resolves to the canonical local adapter command +- **WHEN** author-facing code specifies `adapter = "local"` +- **THEN** runnable resolution produces a runtime runnable with `adapter = "dml-local-adapter"` + +#### Scenario: Built-in sugar resolves to the canonical lambda adapter command +- **WHEN** author-facing code specifies `adapter = "lambda"` +- **THEN** runnable resolution produces a runtime runnable with `adapter = "dml-lambda-adapter"` + +#### Scenario: Plugin-defined sugar resolves to a non-`dml-` adapter command +- **WHEN** a plugin registers symbolic adapter sugar `gpu` +- **THEN** runnable resolution MAY produce a runtime runnable with `adapter = "podman-adapter"` +- **AND** runtime execution treats that resolved command as canonical for that runnable + +### Requirement: Builtin execution SHALL use an explicit empty-adapter exception +The only runtime execution path that MAY accept `adapter = ""` is the explicit builtin-function branch where the runtime detects builtin execution directly and does not shell out to an adapter process. 
+ +#### Scenario: Builtin function bypasses adapter execution +- **WHEN** runtime execution handles a builtin function such as `get` or `concat` +- **THEN** it uses the builtin execution branch instead of spawning an adapter process + +#### Scenario: Empty adapter is not accepted for non-builtin execution +- **WHEN** a non-builtin runnable reaches adapter execution with `adapter = ""` +- **THEN** execution fails instead of treating the empty string as a command-line adapter identity diff --git a/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/tasks.md b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/tasks.md new file mode 100644 index 0000000..c83539a --- /dev/null +++ b/openspec/changes/archive/2026-05-17-strict-concrete-adapter-cli/tasks.md @@ -0,0 +1,17 @@ +## 1. Tighten Runtime Adapter Invocation + +- [x] 1.1 Remove the `IndexOps` adapter import fallback and make missing concrete adapter commands fail closed. +- [x] 1.2 Preserve the existing builtin execution branch so `adapter == ""` remains valid only for explicit builtin-function handling. +- [x] 1.3 Audit runtime adapter invocation helpers and fixtures for any remaining symbolic adapter assumptions. + +## 2. Resolve Sugar Before Runtime + +- [x] 2.1 Ensure author-facing adapter sugar such as `local`, `lambda`, and plugin-defined symbolic names resolves to concrete adapter commands before runtime execution. +- [x] 2.2 Audit contrib executors and runnable constructors so resolved runnables carry canonical concrete adapter strings or explicit executable paths. +- [x] 2.3 Reject or eliminate raw runtime runnable construction patterns that still use symbolic adapter names. + +## 3. Update Contracts And Tests + +- [x] 3.1 Update contract and integration tests to require installed adapter console scripts instead of relying on import-based fallback. +- [x] 3.2 Add or revise tests covering non-`dml-` concrete adapter commands and explicit adapter paths. 
+- [x] 3.3 Add or revise tests covering the explicit builtin empty-adapter exception and rejection of empty adapters for non-builtin execution. diff --git a/openspec/changes/archive/2026-05-17-unify-cli-generation/.openspec.yaml b/openspec/changes/archive/2026-05-17-unify-cli-generation/.openspec.yaml new file mode 100644 index 0000000..231e3ab --- /dev/null +++ b/openspec/changes/archive/2026-05-17-unify-cli-generation/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-18 diff --git a/openspec/changes/archive/2026-05-17-unify-cli-generation/design.md b/openspec/changes/archive/2026-05-17-unify-cli-generation/design.md new file mode 100644 index 0000000..9d86d91 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-unify-cli-generation/design.md @@ -0,0 +1,88 @@ +## Context + +The current `dml` CLI is implemented as a package of hand-written parser modules that manually mirror a public `Dml` surface that is already strongly documented with signatures, docstrings, and `Annotated` metadata. That duplication makes it easy for the CLI to drift from `Dml`, requires repeated parser maintenance for every new public method, and leaves some public workflows unexposed. At the same time, some public `Dml` parameters exist for in-process embedding rather than for command-line use, so the generated CLI needs a filtering rule rather than a naive expose-everything rule. + +## Goals / Non-Goals + +**Goals:** +- Replace the `_cli/` package with one generated `src/daggerml/_cli.py` entrypoint. +- Build command trees from public `Dml` methods and public namespace methods. +- Generate argument parsing from runtime-visible signatures, type hints, defaults, and `Annotated` help text. +- Expose all public methods whose parameter types can be generated from the CLI. +- Keep all CLI output and normalized errors as JSON. +- Move S3 client ownership into `Dml` instances so sync commands remain CLI-exposable. 
+ +**Non-Goals:** +- Supporting every possible Python type at the CLI boundary. +- Representing multiple overload variants as distinct CLI command grammars. +- Preserving the current help text formatting or exact command parser structure. +- Adding new domain workflows beyond those already present on the public `Dml` surface. + +## Decisions + +### Single generated CLI module +The CLI will move to one `src/daggerml/_cli.py` module that owns parser generation, dispatch, JSON serialization, error normalization, and top-level runtime override flags. + +Alternatives considered: +- Keep `_cli/*` and add generator helpers: rejected because it preserves the manual duplication problem. +- Generate code ahead of time: rejected because runtime introspection is already available and easier to keep in sync. + +### Command tree comes from public `Dml` structure +Top-level commands come from public `Dml` callables plus selected class entrypoints such as `Dml.init`. Public namespace objects such as `config`, `runtime`, `dag`, and `admin` become subcommand groups, and their public methods become leaf commands. + +Alternatives considered: +- Maintain an allowlist of command names only: rejected because it reintroduces manual CLI drift. +- Expose private helpers: rejected because the CLI contract should remain on the documented public surface. + +### Filter methods by CLI-generatable parameter types +The generator will expose only methods whose public parameters can be derived from CLI input. Supported parameter families are scalar primitives, `Ref`, `Literal`, optionals of supported types, and JSON-backed container types. Methods with unsupported parameter annotations such as `Any` will be omitted entirely rather than partially exposed. + +Alternatives considered: +- Expose unsupported parameters as raw strings: rejected because it weakens type-driven parsing and invites ambiguous behavior. 
+- Expose a method while silently dropping unsupported parameters: rejected because it changes public method semantics invisibly. + +### Use one runtime-visible signature when overloads are ambiguous +CLI generation will inspect the runtime-visible implementation signature and its resolved annotations. If overloads describe richer variants than the implementation signature can express directly, the generator will pick that one runtime signature and proceed. + +Alternatives considered: +- Encode each overload as a separate CLI grammar: rejected as useful but out of scope. +- Fail generation when overloads exist: rejected because many current public methods already use overloads for return typing only. + +### Argument and help generation rules are mechanical +Required parameters become positional arguments. Defaulted parameters become options. Boolean defaults preserve current behavior through positive flags for `False` defaults and `--no-...` flags for `True` defaults. Positional names remain snake case, option names become kebab case, and positional argument documentation is included in the parser description/help text because `argparse` does not present it well by default. + +Alternatives considered: +- Encode all inputs as options only: rejected because it obscures method signatures. +- Normalize positional names to kebab case: rejected because snake case maps more directly to parameter names. + +### JSON is the only CLI output format +All command results and normalized errors will be emitted as JSON, using the existing typed-leaf serialization rules for `Ref`, `Uri`, and related objects. + +Alternatives considered: +- Preserve special plain-text commands such as `config get`: rejected by scope decision. + +### `Dml` owns its S3 client +`Dml` will initialize `self._s3_client` during construction and remote sync methods will use that stored client instead of public `s3_client` parameters. 
This keeps `push`, `pull`, and `fetch` publicly callable from the generated CLI without forcing unsupported parameter filtering on those methods. + +Alternatives considered: +- Keep `s3_client` in the signature and special-case it in CLI generation: rejected because it pollutes the public surface with a non-CLI concern. +- Instantiate a new client inside every sync method call: rejected because the current surface already allows shared client reuse and the proposal explicitly prefers instance ownership. + +## Risks / Trade-offs + +- Broad CLI grammar change -> Mitigation: document this as a breaking CLI redesign and cover representative commands in CLI tests. +- Signature-driven generation may expose awkward public method names directly -> Mitigation: treat the public `Dml` surface as the canonical CLI contract for this redesign. +- Filtering unsupported methods may hide workflows users expect -> Mitigation: document the filtering rule in specs and keep the supported type set explicit. +- Adding `_s3_client` changes an existing `Dml` private-state constraint -> Mitigation: update the `unified-dml-surface` capability to make the new private field explicit. +- Runtime-visible overload selection may miss future richer variants -> Mitigation: document the limitation now and leave multi-overload CLI support as future work. + +## Migration Plan + +- Land the new CLI generator and update the `dml` entrypoint to import `daggerml._cli:cli` from the new module path. +- Remove `_cli/*` package modules after equivalent generated coverage exists. +- Update CLI docs and tests to the new generated grammar and JSON-only output. +- Update `Dml` sync methods and their callers to use `self._s3_client`. + +## Open Questions + +- None for this proposal; the supported-type filter, overload fallback rule, and `_s3_client` ownership model are intentionally fixed by scope. 
diff --git a/openspec/changes/archive/2026-05-17-unify-cli-generation/proposal.md b/openspec/changes/archive/2026-05-17-unify-cli-generation/proposal.md new file mode 100644 index 0000000..f10e6ab --- /dev/null +++ b/openspec/changes/archive/2026-05-17-unify-cli-generation/proposal.md @@ -0,0 +1,30 @@ +## Why + +The current CLI is spread across many `_cli/*` modules that manually restate the public `Dml` surface. That duplication makes the CLI harder to evolve, leaves some public `Dml` workflows unavailable from the CLI, and weakens the value of the existing signature/docstring/annotation work on `Dml`. + +## What Changes + +- **BREAKING** replace the `_cli/` command package with a single generated `src/daggerml/_cli.py` entrypoint. +- Generate the CLI from the public `Dml` class and its public namespaces, using signatures, docstrings, and `Annotated` metadata for command structure and help text. +- Expose all public CLI-generatable `Dml` workflows, including runtime/admin/dag/config methods, instead of a curated subset. +- Standardize generated argument rules: required parameters become positional arguments, defaulted parameters become options, boolean defaults preserve current behavior through `--flag` or `--no-flag`, and option names use kebab case. +- Standardize CLI I/O around JSON serialization and structured JSON errors for all commands. +- Remove injectable `s3_client` parameters from public `Dml` sync method signatures, initialize `Dml._s3_client` during construction, and route sync workflows through that stored client instead. +- Document that overload ambiguity is out of scope for this change: the generator will use one runtime-visible signature when multiple overloads exist. +- Document and enforce that methods with non-CLI-generatable parameter types such as `Any` are omitted from generated CLI exposure. 
+ +## Capabilities + +### New Capabilities +- `generated-dml-cli`: Generate the CLI directly from the public `Dml` surface, including type-driven argument parsing, help generation, method filtering, and JSON serialization rules. + +### Modified Capabilities +- `unified-dml-surface`: Adjust the shared `Dml` surface contract so sync workflows use an instance-owned S3 client rather than public `s3_client` parameters, while remaining introspection-ready for CLI generation. +- `shared-internal-configuration`: Remove the documented assumption that some public `Dml` workflows stay CLI-unavailable due to serialization limits when their public parameter types are CLI-generatable. +- `cli-thin-interface`: Redefine the CLI transport layer around one generated entrypoint that exposes the public CLI-generatable `Dml` surface while remaining transport-only. + +## Impact + +- Affected code: `src/daggerml/_cli/**`, `src/daggerml/_internal/dml.py`, CLI tests, packaging entrypoints, and CLI docs/specs. +- Public interface: the `dml` CLI grammar and help output change broadly; all commands return JSON. +- Runtime behavior: remote sync workflows continue to work after moving S3 client ownership into `Dml` instances. diff --git a/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/cli-thin-interface/spec.md b/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/cli-thin-interface/spec.md new file mode 100644 index 0000000..9b368d6 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/cli-thin-interface/spec.md @@ -0,0 +1,32 @@ +## MODIFIED Requirements + +### Requirement: CLI handlers are transport-only +The CLI command layer SHALL be limited to discovering command shape from the public `Dml` surface, parsing command inputs, invoking domain interfaces, and serializing outputs, and SHALL NOT contain business workflow or domain decision logic. 
+ +#### Scenario: CLI parses and delegates +- **WHEN** a user invokes any CLI command +- **THEN** the handler parses flags and arguments, calls a domain entrypoint, and formats the returned result without domain branching in the CLI layer + +#### Scenario: Generated command discovery remains transport-only +- **WHEN** the CLI inspects `Dml` signatures, annotations, and docstrings to build commands +- **THEN** that inspection is used only to derive transport behavior and not to re-implement domain workflow rules in the CLI layer + +## ADDED Requirements + +### Requirement: One generated CLI module owns the public transport surface +The `dml` CLI SHALL be implemented through a single generated transport module rather than a package of hand-maintained per-command parser modules. + +#### Scenario: Public CLI entrypoint resolves through one module +- **WHEN** the `dml` script entrypoint is loaded +- **THEN** it imports one CLI module that generates and dispatches the public command surface + +### Requirement: Generated CLI command exposure follows the public `Dml` surface +The CLI SHALL expose the public CLI-generatable `Dml` surface directly rather than maintaining a smaller curated command subset. 
+ +#### Scenario: Runtime workflows become CLI-visible when generatable +- **WHEN** a public `dml.runtime` method uses only CLI-generatable parameter types +- **THEN** the generated CLI exposes that runtime workflow as a command + +#### Scenario: JSON output is uniform across generated commands +- **WHEN** any generated CLI command succeeds or fails +- **THEN** the CLI emits JSON rather than mixing JSON and plain-text command modes diff --git a/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/generated-dml-cli/spec.md b/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/generated-dml-cli/spec.md new file mode 100644 index 0000000..b2cc646 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/generated-dml-cli/spec.md @@ -0,0 +1,73 @@ +## ADDED Requirements + +### Requirement: CLI surface is generated from the public `Dml` API +The system SHALL generate the `dml` command tree from the public `Dml` class and its public namespaces rather than from a hand-maintained set of per-command parser modules. + +#### Scenario: Top-level public methods become commands +- **WHEN** a public callable exists on `Dml` and its parameters are CLI-generatable +- **THEN** the CLI exposes a top-level command for that method + +#### Scenario: Public namespaces become command groups +- **WHEN** a public namespace object is reachable from `Dml` +- **THEN** the CLI exposes that namespace as a subcommand group and exposes its public CLI-generatable methods as leaf commands + +### Requirement: CLI only exposes methods with generatable parameter types +The CLI SHALL omit any public `Dml` or namespace method whose parameter annotations cannot be generated from command-line input. 
+ +#### Scenario: Unsupported parameter type omits method +- **WHEN** a public method includes a parameter annotated with an unsupported type such as `Any` +- **THEN** the CLI does not expose that method + +#### Scenario: Supported typed method remains exposed +- **WHEN** a public method uses only supported parameter families such as `Ref`, `int`, `float`, `str`, `Literal`, optionals of those types, or JSON-backed container types +- **THEN** the CLI exposes that method + +### Requirement: Generated arguments follow signature-driven CLI rules +The CLI SHALL derive argument shape from runtime-visible signatures, defaults, and resolved annotations. + +#### Scenario: Required parameters become positional arguments +- **WHEN** a public method parameter has no default value +- **THEN** the generated CLI exposes it as a positional argument using the snake_case parameter name + +#### Scenario: Defaulted parameters become options +- **WHEN** a public method parameter has a default value +- **THEN** the generated CLI exposes it as an option using the kebab-case parameter name + +#### Scenario: Boolean defaults preserve behavior +- **WHEN** a boolean parameter default is `False` +- **THEN** the generated CLI exposes a positive `--{name}` flag, where `{name}` is the kebab-case parameter name +- **AND** when a boolean parameter default is `True` +- **THEN** the generated CLI exposes a negative `--no-{name}` flag, where `{name}` is the kebab-case parameter name + +### Requirement: Generated parsing uses annotations and documented help metadata +The CLI SHALL parse supported argument types from resolved annotations and SHALL use docstrings plus `Annotated` metadata to generate command help. 
+ +#### Scenario: Literal annotations constrain choices +- **WHEN** a parameter is annotated with `Literal[...]` +- **THEN** the generated CLI restricts accepted values to those literals + +#### Scenario: Ref annotations parse as refs +- **WHEN** a parameter is annotated as `Ref` +- **THEN** the generated CLI parses the input string into a `Ref` value before calling the method + +#### Scenario: Positional arguments are documented in help text +- **WHEN** a generated command includes positional arguments +- **THEN** the command help includes positional argument documentation derived from annotations or doc metadata rather than relying only on default `argparse` positional rendering + +### Requirement: Overload ambiguity uses one runtime signature +The CLI SHALL generate commands from one runtime-visible signature even when overload declarations describe multiple static variants. + +#### Scenario: Overloaded method still generates one command +- **WHEN** a public method has overload declarations and one implementation signature +- **THEN** the CLI uses the implementation signature for generation and does not create multiple command variants + +### Requirement: Generated CLI output and errors are JSON +The generated CLI SHALL emit JSON for successful results and normalized failures. 
+ +#### Scenario: Successful command emits JSON +- **WHEN** a generated CLI command returns a value +- **THEN** the CLI serializes that value as JSON using the standard typed-leaf encoder + +#### Scenario: Failed command emits structured JSON error +- **WHEN** generated command execution raises an exception +- **THEN** the CLI emits a structured JSON error payload instead of an unstructured traceback diff --git a/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/shared-internal-configuration/spec.md b/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..042ebb2 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/shared-internal-configuration/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: CLI limitations caused by serialization are documented, not treated as config divergence +The system SHALL document only those public `Dml` workflows that remain unavailable in the CLI because their public parameter types cannot be generated faithfully from command-line input. These omissions MUST NOT create a separate CLI-specific configuration model. 
+ +#### Scenario: Unsupported public parameter types remain API-only +- **WHEN** a public workflow exposes parameter types that the CLI generator cannot represent cleanly +- **THEN** the documentation identifies that workflow as unavailable in the CLI while preserving the shared internal configuration model for supported operations + +#### Scenario: CLI-generatable public workflows are not excluded for historical reasons +- **WHEN** a public workflow uses only CLI-generatable parameter types +- **THEN** the CLI exposes that workflow instead of treating it as API-only based on prior manual CLI limitations + +#### Scenario: Missing CLI feature does not imply different config rules +- **WHEN** a capability is supported by both API and CLI +- **THEN** both frontends use the same shared internal configuration rules for that capability diff --git a/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/unified-dml-surface/spec.md b/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..a1b4a25 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-unify-cli-generation/specs/unified-dml-surface/spec.md @@ -0,0 +1,56 @@ +## MODIFIED Requirements + +### Requirement: Shared `Dml` surface SHALL be introspection-ready +The shared `Dml` boundary and its public namespaces SHALL expose runtime documentation that explains class purpose, method behavior, and parameter meaning without changing workflow semantics, and that metadata SHALL be sufficient for generated CLI help. 
+ +#### Scenario: Namespace objects describe their purpose +- **WHEN** a caller inspects `Dml` or any namespace reachable through `dml.config`, `dml.runtime`, `dml.dag`, or `dml.admin` +- **THEN** the class exposes a docstring that describes the purpose of that boundary or namespace + +#### Scenario: Public methods describe behavior +- **WHEN** a caller inspects a public top-level or namespaced `Dml` method +- **THEN** the method exposes a docstring that describes the operation behavior and any notable constraints or side effects + +#### Scenario: Generated CLI help can use runtime docs +- **WHEN** the CLI generator inspects `Dml` or one of its public namespace methods +- **THEN** it can derive command descriptions and parameter help from runtime docstrings and annotation metadata without a separate command-specific help registry + +### Requirement: Shared `Dml` parameters SHALL expose machine-readable help metadata +Public parameters on the shared `Dml` surface and its public namespace methods SHALL use `typing.Annotated` metadata to describe parameter meaning, while Python signature defaults remain the source of truth for default values. 
+ +#### Scenario: Parameter meaning is available from annotations +- **WHEN** a caller inspects annotations for a public `Dml` method or a public method on a `Dml` namespace object with extras included +- **THEN** the parameter annotations include `Annotated` metadata that describes what each user-facing parameter means + +#### Scenario: Defaults remain in the signature +- **WHEN** a public `Dml` or namespaced method has a defaulted parameter +- **THEN** the default value remains represented by the Python signature +- **AND** the `Annotated` metadata does not become the source of truth for that default + +#### Scenario: Ambiguous selector parameters may include examples +- **WHEN** a public `Dml` parameter accepts potentially confusing selector or URI forms such as revision selectors or remote project identifiers +- **THEN** the `Annotated` metadata MAY include concise examples that clarify accepted forms without redefining the underlying grammar + +#### Scenario: Non-generatable CLI parameters are not part of the public method surface +- **WHEN** a public workflow depends on helper state that cannot be generated from CLI input such as an S3 client object +- **THEN** that helper state is provided through `Dml` instance construction or private instance state rather than through a public method parameter + +### Requirement: `Dml` stores runtime context, S3 client state, and temporary-directory bookkeeping +The shared `Dml` class SHALL keep only `_context`, `_s3_client`, and `_tempdirs` as private instance attributes. Helper behavior that supports `Dml` public methods SHALL live in module-level functions within `daggerml._internal.dml` rather than in private `Dml` instance methods. 
+ +#### Scenario: Namespace and helper access do not require extra Dml instance fields +- **WHEN** a caller uses any public namespace on `Dml` +- **THEN** the namespace behavior is derived from `_context`, `_s3_client`, `_tempdirs`, and delegated helper logic without introducing additional private `Dml` instance attributes + +#### Scenario: Dml public workflows do not depend on private helper methods +- **WHEN** a `Dml` repository, runtime, DAG, admin, or config workflow needs helper behavior such as ops dispatch, payload shaping, or revision binding +- **THEN** that helper behavior executes through module-level functions in `daggerml._internal.dml` rather than through `Dml._...` instance methods + +#### Scenario: Namespace objects keep only Dml as private state +- **WHEN** a caller inspects the namespace objects exposed by `Dml` +- **THEN** each namespace object keeps only `._dml` as private instance state +- **AND** namespace helper behavior does not rely on additional private attrs or private helper methods on the namespace object + +#### Scenario: Remote sync workflows reuse the Dml-owned S3 client +- **WHEN** a caller invokes `dml.fetch`, `dml.pull`, or `dml.push` +- **THEN** the workflow uses the `Dml` instance's private `_s3_client` instead of requiring a public `s3_client` method parameter diff --git a/openspec/changes/archive/2026-05-17-unify-cli-generation/tasks.md b/openspec/changes/archive/2026-05-17-unify-cli-generation/tasks.md new file mode 100644 index 0000000..8ee07d4 --- /dev/null +++ b/openspec/changes/archive/2026-05-17-unify-cli-generation/tasks.md @@ -0,0 +1,18 @@ +## 1. Reshape the `Dml` surface for CLI generation + +- [x] 1.1 Remove public `s3_client` parameters from sync-facing `Dml` methods and initialize `Dml._s3_client` during construction. +- [x] 1.2 Update remote sync helpers and callers so `fetch`, `pull`, and `push` use the Dml-owned S3 client. 
+- [x] 1.3 Review public `Dml` and namespace method annotations/docstrings so generated CLI help and parsing metadata are complete and accurate. + +## 2. Build the generated CLI entrypoint + +- [x] 2.1 Replace the `_cli/` package implementation with a single `src/daggerml/_cli.py` module that owns parser generation, dispatch, logging, and JSON/error serialization. +- [x] 2.2 Implement command discovery from public `Dml` methods, public namespaces, and supported class entrypoints such as `Dml.init`. +- [x] 2.3 Implement type-driven argument generation and parsing for supported types, including boolean flag inversion rules, `Ref` parsing, JSON-backed container parsing, and positional help text rendering. +- [x] 2.4 Implement method filtering so commands are generated only for methods whose public parameter types are CLI-generatable, and document the one-runtime-signature overload rule in code comments or user-facing help where appropriate. + +## 3. Update packaging, tests, and docs + +- [x] 3.1 Update CLI packaging/import wiring and remove obsolete `_cli/*` command modules. +- [x] 3.2 Add or update CLI tests covering generated command exposure, unsupported-method filtering, JSON output/errors, and representative runtime/admin/dag/config flows. +- [x] 3.3 Update CLI documentation to describe the generated surface, JSON-only output, supported type parsing rules, and the overload limitation. 
diff --git a/openspec/changes/archive/2026-05-18-reorganize-project-docs/.openspec.yaml b/openspec/changes/archive/2026-05-18-reorganize-project-docs/.openspec.yaml new file mode 100644 index 0000000..28882f7 --- /dev/null +++ b/openspec/changes/archive/2026-05-18-reorganize-project-docs/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-19 diff --git a/openspec/changes/archive/2026-05-18-reorganize-project-docs/design.md b/openspec/changes/archive/2026-05-18-reorganize-project-docs/design.md new file mode 100644 index 0000000..1ece283 --- /dev/null +++ b/openspec/changes/archive/2026-05-18-reorganize-project-docs/design.md @@ -0,0 +1,217 @@ +## Context + +The current repository has strong technical documentation coverage, but most of that coverage is written as an authority-driven spec suite. `docs/README.md` is a governance index, `docs/DOC_MAP.md` is an edit workflow rulebook, `docs/spec/overview.md` is a concept-authority map, and most topic docs lead with status, scope, authority, invariants, and compatibility language. That structure is useful for disciplined maintenance, but it does not match the audience boundary we want now: `docs/` should be for humans trying to understand or use DaggerML as it exists. + +The repo also already has a natural place for agent-facing planning and change management: `openspec/`. That lets us make a cleaner distinction: + +- `docs/`: human-facing project docs +- `openspec/`: change proposals, specs, and tasks for agent-driven work +- contributor workflow docs outside `docs/`: edit pre-read rules, test taxonomy, and similar maintainer guidance + +This change is a documentation-architecture change rather than a product behavior change. The risk is not code regression; the risk is losing useful technical information, preserving the wrong audience voice, or turning the work into a path reshuffle without improving the reading experience. 
+ +## Goals / Non-Goals + +**Goals:** +- Establish a human-facing information architecture for `docs/`. +- Keep `getting-started` as one concise page rather than a directory of tiny setup docs. +- Define what each doc lane contains: docs home, getting started, concepts, guides, reference, architecture, and contrib. +- Identify which current docs stay in `docs/` as rewritten material and which should move out because they are maintainer- or agent-facing. +- Require each implementation task to be handled by an independent subagent that first reads the repo and relevant code/docs so the rewritten docs remain grounded in reality. +- Preserve dense technical knowledge while translating it into reader-first prose. + +**Non-Goals:** +- Changing code behavior, API semantics, CLI semantics, or storage/runtime behavior. +- Rewriting every sentence in one pass without regard to existing useful content. +- Moving OpenSpec change artifacts into `docs/` or treating `docs/` as a normative spec suite. +- Creating a large multi-file onboarding section for basics that fit comfortably in one getting-started document. + +## Decisions + +### `docs/` will be organized by reader intent + +The target top-level shape will be: + +- `docs/README.md`: reader-facing docs home and navigation map +- `docs/getting-started.md`: one compact setup-and-first-success page +- `docs/concepts/`: mental models and core domain explanations +- `docs/guides/`: task-oriented walkthroughs +- `docs/reference/`: API, CLI, configuration, and error reference material +- `docs/architecture/`: system structure and internal design explanations for advanced readers and contributors +- `docs/contrib/`: a parallel subtree for contrib-specific concepts, guides, reference, and architecture + +Rationale: +- Human readers usually navigate by intent such as learning concepts, accomplishing a task, or checking a reference. 
+- The current source-tree-shaped and authority-shaped layout leaks maintainer concerns into the main reading path. + +Alternatives considered: +- Keep the existing file layout and only soften the language. Rejected because the structure itself still centers governance instead of reader needs. +- Mirror the source tree directly in docs. Rejected because implementation decomposition is not the best primary reading experience. + +### `getting-started` stays a single file + +`docs/getting-started.md` will be one concise page that covers: + +- what DaggerML is in a few sentences +- installation as a one-line `pip install daggerml` +- optional CLI installation note +- creating a repo +- creating a first DAG in Python +- one or two CLI inspection commands such as listing DAGs +- cleanup or next-step links + +Rationale: +- The basic setup path is short and should feel fast. +- Splitting installation, first repo, and first DAG into many files adds navigation overhead without adding conceptual value. + +Alternatives considered: +- A `getting-started/` directory with several pages. Rejected because the setup surface is too small to justify a subtree. 
+ +### Existing project docs will be rewritten into four human-facing doc modes + +Current technical material will be translated into one of four doc modes: + +- Concepts: explain what something is and how to think about it +- Guides: explain how to accomplish a workflow +- Reference: explain exact user-facing surfaces and options +- Architecture: explain how the internals are structured and interact + +Likely placement of current material: + +- `api.md` -> `reference/python-api.md` +- `cli.md` -> `reference/cli.md` +- `configuration.md` -> `reference/configuration.md` +- `errors.md` -> `reference/errors.md` +- `object-model.md`, `dag-model.md`, `commit-model.md`, `codec-system.md`, `execution-model.md`, `remote-sync.md` -> `concepts/` +- `system.md`, `internal/README.md`, `internal/ops/*.md`, `remote-protocol.md`, deep internal storage/type docs -> `architecture/` +- `storing-and-retrieving-external-data.md` may split into a concept doc plus a guide if the current material mixes model and workflow + +Rationale: +- Most current docs contain valuable content, but their framing is wrong for the intended audience. +- Separating mode by reader question makes each page easier to write and easier to use. + +Alternatives considered: +- Preserve file names and just change headings. Rejected because many names and paths currently encode implementation ownership rather than reader purpose. + +### Maintainer governance docs will move out of `docs/` + +The following categories will no longer live in human-facing project docs: + +- edit pre-read workflow maps such as `docs/DOC_MAP.md` +- spec-governance indexes such as `docs/spec/overview.md` +- contributor policy docs such as `docs/testing-taxonomy.md` + +These should move to contributor or agent-facing homes such as `CONTRIBUTING.md`, `AGENTS.md`, `.opencode/`, or another clearly maintainer-oriented location chosen during implementation. 
+ +Rationale: +- These documents describe how maintainers and agents work on the repo, not how DaggerML works. +- Leaving them under `docs/` blurs the audience boundary the change is trying to create. + +Alternatives considered: +- Keep them in `docs/` under a maintainer-only subdirectory. Rejected because the user explicitly wants `docs/` to be the human-facing project docs set. + +### Each doc lane should have named target pages and concrete content expectations + +The reorganization will not stop at creating directories. Each target area should have a defined purpose and likely page set. + +Proposed contents: + +- `docs/README.md` + - explain the overall docs map + - link readers to getting started, concepts, guides, reference, architecture, and contrib + - explain in one short note that `openspec/` is for change planning, not product docs + +- `docs/getting-started.md` + - installation + - create/select a repo + - create a first DAG in Python + - inspect with CLI + - pointers to next concept and reference docs + +- `docs/concepts/` + - `overview.md`: how the concepts fit together + - `dags-and-nodes.md` + - `commits-and-history.md` + - `refs-and-namespaces.md` + - `execution.md` + - `storage.md` + - `remotes.md` + - `codecs-and-values.md` + +- `docs/guides/` + - `create-and-run-a-dag.md` + - `inspect-a-repository.md` + - `work-with-remotes.md` + - `store-and-load-external-data.md` + - `troubleshoot-common-errors.md` + +- `docs/reference/` + - `python-api.md` + - `cli.md` + - `configuration.md` + - `errors.md` + +- `docs/architecture/` + - `system-overview.md` + - `internal-modules.md` + - `ops-layer.md` + - `storage-internals.md` + - `remote-protocol.md` + - `type-system.md` + +- `docs/contrib/` + - `README.md` + - `getting-started.md` or a short start section inside the README if the material is small + - `concepts/` + - `guides/` + - `reference/` + - `architecture/` + +Rationale: +- Named target pages make the work concrete and reviewable. 
+- Writers can preserve and reshape existing material with much less ambiguity. + +Alternatives considered: +- Leave page selection to whoever implements each subtree. Rejected because that would create inconsistent granularity and duplicated topics. + +### Implementation work will be partitioned into independent repo-aware subagents + +The task plan will assign each major docs area to a separate subagent. Each subagent must inspect the current repo before drafting docs for its area, including: + +- the current docs that cover the same topic +- the relevant source modules and entrypoints +- the root `README.md` and contributor context where relevant + +Subagents should be able to work in parallel because the new IA boundaries are intentionally separated by reader intent and subtree ownership. + +Rationale: +- Good docs require understanding the real system, not just moving prose around. +- Independent subagents reduce merge contention and allow parallel progress while keeping each lane coherent. + +Alternatives considered: +- One agent rewrites the entire docs tree. Rejected because it couples unrelated doc lanes and makes it harder to maintain area-specific grounding. + +## Risks / Trade-offs + +- [Useful technical detail gets lost during simplification] -> Preserve existing dense docs as source material and require each subagent to read both current docs and relevant code before rewriting. +- [New IA creates dead links or duplicated explanations] -> Define clear page purposes up front and reserve overview pages for navigation rather than repeated deep content. +- [Contrib docs drift away from core docs style] -> Give contrib its own parallel subtree but require the same concepts/guides/reference/architecture split. +- [Maintainer workflow docs become harder to find after leaving `docs/`] -> Choose explicit destination homes during implementation and update contributor-facing entry points at the same time. 
+- [Subagents write generic docs disconnected from the codebase] -> Require repo inspection in every task and review outputs for code-anchored accuracy. + +## Migration Plan + +1. Create the new docs skeleton and human-facing docs home. +2. Rewrite the root getting-started experience as a single file. +3. Migrate core content into concepts, guides, reference, and architecture lanes. +4. Rebuild `docs/contrib/` using the same reader-intent model. +5. Move maintainer workflow docs out of `docs/` and update their entry points. +6. Remove or redirect obsolete paths and verify the final navigation is coherent. + +Rollback is low-risk because this is documentation-only. If the new shape proves confusing during review, files can be reworked before merge without product-facing compatibility concerns. + +## Open Questions + +- Should contributor-only docs outside `docs/` live primarily in `CONTRIBUTING.md`, in a dedicated contributor-docs subtree, or in `.opencode/` and `AGENTS.md` depending on audience? +- Should `docs/contrib/` include its own separate getting-started page, or is a short reader path in `docs/contrib/README.md` enough? +- Which current concept docs need to split into both a concept page and a guide, instead of being translated one-to-one? diff --git a/openspec/changes/archive/2026-05-18-reorganize-project-docs/proposal.md b/openspec/changes/archive/2026-05-18-reorganize-project-docs/proposal.md new file mode 100644 index 0000000..f96a2a5 --- /dev/null +++ b/openspec/changes/archive/2026-05-18-reorganize-project-docs/proposal.md @@ -0,0 +1,28 @@ +## Why + +The current `docs/` tree mixes human-facing project documentation with maintainer governance, edit workflow rules, and spec-suite authority language. We want `docs/` to read as a description of DaggerML as it exists for human readers, while keeping agent-facing change planning in `openspec/` and moving maintainer workflow material out of the project docs surface. 
+ +## What Changes + +- Reorganize `docs/` around reader intent instead of authority ownership: a docs home, one concise getting-started page, concept docs, guides, reference docs, architecture docs, and a parallel `contrib` docs subtree. +- Rewrite the project docs so they describe DaggerML in user-facing language rather than normative spec language such as document authority, compatibility classes, and mandatory handoff rules. +- Keep `getting-started` as a single compact page that covers installation, first repo, first DAG, and where to go next instead of splitting those basics across multiple tiny files. +- Define specific content expectations for each target doc so the reorganization is not just path churn. +- Move maintainer- and agent-facing material out of `docs/`, including edit pre-read workflow guidance and spec-governance content that belongs with contributor or agent tooling instead. +- Preserve valuable technical content by translating existing docs into concept, reference, and architecture narratives rather than deleting detail. + +## Capabilities + +### New Capabilities +- `human-facing-project-docs`: Define the required information architecture and audience boundary for the repository's human-facing project documentation. + +### Modified Capabilities + +None. + +## Impact + +- Affects `docs/README.md`, most existing `docs/*.md` files, and the organization of `docs/internal/` and `docs/contrib/` content. +- Affects maintainer-oriented docs that currently live under `docs/`, especially `docs/DOC_MAP.md`, `docs/spec/overview.md`, and `docs/testing-taxonomy.md`. +- Does not change runtime behavior, public APIs, CLI semantics, storage formats, or OpenSpec capability behavior. +- Changes how contributors and readers discover project information, so the final docs need clear navigation and consistent audience boundaries. 
diff --git a/openspec/changes/archive/2026-05-18-reorganize-project-docs/specs/human-facing-project-docs/spec.md b/openspec/changes/archive/2026-05-18-reorganize-project-docs/specs/human-facing-project-docs/spec.md new file mode 100644 index 0000000..6667382 --- /dev/null +++ b/openspec/changes/archive/2026-05-18-reorganize-project-docs/specs/human-facing-project-docs/spec.md @@ -0,0 +1,62 @@ +## ADDED Requirements + +### Requirement: `docs/` SHALL be reserved for human-facing project documentation +The repository SHALL treat `docs/` as the human-facing project documentation surface that describes DaggerML as it exists, while agent-facing change-planning artifacts remain in `openspec/` and maintainer workflow rules live outside `docs/`. + +#### Scenario: Human reader enters the docs tree +- **WHEN** a reader opens `docs/` +- **THEN** the visible content describes the product, its usage, its concepts, or its architecture for humans rather than agent workflow or change-planning procedure + +#### Scenario: Agent-facing planning remains outside project docs +- **WHEN** a reader needs change proposals, implementation tasks, or requirement deltas for a change +- **THEN** those artifacts are found under `openspec/` rather than inside `docs/` + +### Requirement: Project docs SHALL be organized by reader intent +The `docs/` tree SHALL organize its primary navigation around reader intent with a docs home, one getting-started page, concept docs, guides, reference docs, architecture docs, and a contrib subtree using the same broad model. 
+ +#### Scenario: Reader looks for onboarding +- **WHEN** a new reader wants the fastest path to first success +- **THEN** `docs/README.md` points to a single `docs/getting-started.md` page rather than a fragmented getting-started subtree + +#### Scenario: Reader looks for the right kind of information +- **WHEN** a reader wants a mental model, a workflow, an exact command surface, or an internal system explanation +- **THEN** the docs navigation distinguishes those needs through concepts, guides, reference, and architecture sections + +### Requirement: `getting-started` SHALL be one concise page +The project docs SHALL provide a single getting-started page that covers installation, first repository setup, first DAG creation, basic inspection, and next-step links without splitting those basics across multiple introductory files. + +#### Scenario: Reader starts from zero +- **WHEN** a reader follows `docs/getting-started.md` +- **THEN** the page includes enough information to install DaggerML, create or select a repo, create a first DAG, and inspect it with at least one simple command or API example + +### Requirement: Human-facing docs SHALL avoid normative spec voice +Docs under `docs/` SHALL describe the system in reader-facing language and SHALL avoid structuring pages around authority ownership, compatibility classifications, or normative maintenance phrases such as document-level handoff rules. 
+ +#### Scenario: Reader opens a topic doc +- **WHEN** a reader opens a concept, guide, reference, or architecture page under `docs/` +- **THEN** the document leads with an explanation of the subject matter instead of an authority or governance preamble + +### Requirement: Existing technical content SHALL be preserved through translation, not path churn +When current docs are reorganized, the implementation SHALL preserve useful technical knowledge by rewriting and reclassifying existing material into concept, guide, reference, or architecture pages rather than merely renaming files or deleting depth. + +#### Scenario: Existing detailed doc is migrated +- **WHEN** a current technical document contains valuable behavioral or architectural explanation +- **THEN** the new docs structure preserves that information in an appropriate human-facing page even if the original path or tone changes + +### Requirement: Maintainer workflow docs SHALL leave `docs/` +Repository-maintenance documents such as edit pre-read maps, spec-governance indexes, and contributor test-taxonomy policy SHALL NOT remain in the human-facing `docs/` tree after the reorganization. + +#### Scenario: Maintainer looks for workflow guidance +- **WHEN** a maintainer needs edit workflow or contributor-policy guidance +- **THEN** that guidance is located in a maintainer-facing location outside `docs/` + +### Requirement: Docs rewrite tasks SHALL be independently assignable to repo-aware subagents +The reorganization plan SHALL divide implementation work into independent documentation tasks whose owners first inspect the existing repo, current docs, and relevant code for the area they are rewriting. 
+ +#### Scenario: Subagent rewrites a docs area +- **WHEN** a subagent is assigned a docs subtree or topic lane +- **THEN** the task instructions require that subagent to read the current docs for that area and inspect the corresponding source modules before producing rewritten docs + +#### Scenario: Parallel docs work proceeds safely +- **WHEN** multiple subagents work on different doc lanes such as concepts, reference, architecture, or contrib +- **THEN** the task boundaries are specific enough that each subagent can make progress independently without redefining the whole docs architecture diff --git a/openspec/changes/archive/2026-05-18-reorganize-project-docs/tasks.md b/openspec/changes/archive/2026-05-18-reorganize-project-docs/tasks.md new file mode 100644 index 0000000..58ce433 --- /dev/null +++ b/openspec/changes/archive/2026-05-18-reorganize-project-docs/tasks.md @@ -0,0 +1,53 @@ +## 1. Establish the new docs skeleton and navigation + +- [x] 1.1 Assign an independent subagent to inspect the current `docs/` tree, `README.md`, and the most central product entrypoints, then rewrite `docs/README.md` as a human-facing docs home with links to `getting-started`, concepts, guides, reference, architecture, and contrib. +- [x] 1.2 Have that same subagent create or align the top-level docs skeleton so the target sections exist with clear reader-facing purposes and no governance-style framing. +- [x] 1.3 Verify the new docs home explains the audience split clearly: `docs/` is for humans, `openspec/` is for change planning. + +## 2. Rewrite the onboarding path as one concise getting-started page + +- [x] 2.1 Assign an independent subagent to read the root `README.md`, current setup-related docs, and the public Python and CLI entrypoints, then write `docs/getting-started.md` as a single page covering installation, repo setup, first DAG creation, basic inspection, and next steps. 
+- [x] 2.2 Require that subagent to keep the page short and practical, using the real repo commands and API surface rather than generic onboarding prose. +- [x] 2.3 Verify the resulting page stands alone for a new reader without introducing a fragmented getting-started subtree. + +## 3. Rebuild the core concepts lane + +- [x] 3.1 Assign an independent subagent to inspect the current concept-heavy docs and the corresponding core modules, then map the content into target concept pages such as DAGs and nodes, commits and history, refs and namespaces, execution, storage, remotes, and codecs and values. +- [x] 3.2 Require that subagent to preserve the real technical model while rewriting the prose away from authority/invariant boilerplate and toward reader mental models. +- [x] 3.3 Verify the concepts lane explains how the major ideas fit together and avoids duplicating command-reference detail better suited to guides or reference docs. + +## 4. Rebuild the guides lane + +- [x] 4.1 Assign an independent subagent to inspect the current docs and likely workflows across the CLI and Python API, then draft task-oriented guides such as creating and running a DAG, inspecting a repository, working with remotes, storing external data, and troubleshooting common errors. +- [x] 4.2 Require that subagent to base each guide on real workflows already supported by the repo rather than aspirational flows. +- [x] 4.3 Verify the guides lane links outward to concepts and reference instead of trying to absorb all explanatory detail itself. + +## 5. Rebuild the core reference lane + +- [x] 5.1 Assign an independent subagent to inspect the public API, CLI, configuration, and error surfaces in code plus the current docs for those topics, then rewrite them as reader-facing reference docs under `docs/reference/`. +- [x] 5.2 Require that subagent to keep exactness where needed while avoiding spec-governance headings such as authority, handoffs, and compatibility sections. 
+- [x] 5.3 Verify the reference pages remain tightly aligned with the actual code surfaces and examples. + +## 6. Rebuild the architecture lane + +- [x] 6.1 Assign an independent subagent to inspect the internal modules, ops layer, storage implementation, remote protocol surface, and existing internal docs, then reshape them into architecture docs under `docs/architecture/`. +- [x] 6.2 Require that subagent to explain subsystem relationships, data flow, and layering in human terms while staying grounded in the real module layout. +- [x] 6.3 Verify the architecture lane serves advanced readers and contributors without reverting to a normative spec voice. + +## 7. Rebuild contrib docs as a parallel human-facing subtree + +- [x] 7.1 Assign an independent subagent to inspect `src/daggerml/contrib/**`, the existing `docs/contrib/` set, and any relevant examples/tests, then reorganize contrib docs into a coherent human-facing subtree with a clear start point plus concepts, guides, reference, and architecture sections as needed. +- [x] 7.2 Require that subagent to preserve contrib-specific runtime and API detail while matching the tone and reader-intent model of the main docs. +- [x] 7.3 Verify contrib docs are navigable on their own and clearly connected back to the main docs home. + +## 8. Move maintainer workflow material out of `docs/` + +- [x] 8.1 Assign an independent subagent to inspect `docs/DOC_MAP.md`, `docs/spec/overview.md`, `docs/testing-taxonomy.md`, `AGENTS.md`, `CONTRIBUTING.md`, and `.opencode/`, then propose and execute new homes for maintainer- and agent-facing guidance outside `docs/`. +- [x] 8.2 Require that subagent to preserve the workflow value of those documents while making their audience explicit in their new locations. +- [x] 8.3 Verify the final `docs/` tree no longer contains maintainer workflow rules or spec-governance material. + +## 9. 
Final coherence pass + +- [x] 9.1 Assign an independent subagent to review the completed docs tree as a reader, checking navigation, cross-links, tone consistency, and audience boundaries across all lanes. +- [x] 9.2 Require that subagent to spot duplicated material, stale links, and sections that still read like internal specs instead of human docs. +- [x] 9.3 Verify the final result presents DaggerML clearly as it exists today and that each lane is grounded in actual repo behavior. diff --git a/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/.openspec.yaml b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/.openspec.yaml new file mode 100644 index 0000000..8b76914 --- /dev/null +++ b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-20 diff --git a/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/design.md b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/design.md new file mode 100644 index 0000000..29ecf6f --- /dev/null +++ b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/design.md @@ -0,0 +1,80 @@ +## Context + +`Dml` currently overloads strings to mean both selectors and exact DB identities. The sharpest examples are DAG and node access methods that accept either `Ref` objects or ref-like strings, plus payloads that return both `id` and `ref` for the same DB-backed object. + +This change keeps string-based lookup where the underlying value is not itself a DB object, and requires `Ref` everywhere else. That preserves ergonomic lookup APIs while making exact-object APIs uniform. + +## Goals / Non-Goals + +**Goals:** +- Make `Ref` the only exact-input and exact-output contract for DB-backed objects on the `Dml` surface. +- Keep selector-style strings for revisions, names, branches, tags, remote URIs, and `index_id` values. +- Split lookup behavior from dereference and mutation behavior so signatures communicate intent. 
+- Remove duplicated raw DB `id` payload fields where `Ref` already identifies the object. + +**Non-Goals:** +- Converting runtime indexes from string ids to `Ref` objects. +- Removing selector-based repository workflows such as revision, branch, or DAG-name lookup. +- Changing lower-level storage identity rules or namespace formats. + +## Decisions + +### 1. Exact DB object contracts use `Ref` only +Methods that operate on exact DB-backed objects will require `Ref` and will no longer coerce `"ns:..."` strings into refs. + +Why: +- It removes ambiguous string contracts. +- It aligns exact-input methods with the typed storage model. + +Alternative considered: +- Keep `str | Ref` and document stricter meaning. Rejected because the runtime contract would still be ambiguous at the call site. + +### 2. Selector contracts remain string-based +Methods whose job is lookup or navigation will continue accepting strings for revisions, names, branches, tags, remote URIs, and `index_id` values. + +Why: +- Those values are not DB objects. +- They are naturally human-authored selectors. + +Alternative considered: +- Introduce new wrapper types for selectors. Rejected as unnecessary surface expansion. + +### 3. `Dml` separates lookup from dereference +Lookup methods may accept selector strings and return refs. Dereference and mutation methods will accept refs directly. + +Why: +- It makes the API boundary legible. +- It gives `daggerml.api` a clean place to remain ergonomic while `_internal.Dml` stays strict. + +Alternative considered: +- Preserve combined lookup-and-dereference methods. Rejected because they force selector parsing into object-read APIs. + +### 4. DB-backed payload identity uses `ref`, not duplicate `id` +Payloads for commits, DAGs, nodes, and other DB-backed objects will expose `Ref` as the canonical identity and drop duplicate raw `id` fields. Non-DB handles like `index_id` remain strings. + +Why: +- It removes two ways to identify the same object. 
+- It matches the input-side contract. + +Alternative considered: +- Keep both `id` and `ref` for convenience. Rejected because it preserves the exact ambiguity this change is trying to eliminate. + +## Risks / Trade-offs + +- [Breaking callers that pass `.to` strings] -> Update `daggerml.api`, CLI routing, and contract tests together. +- [More explicit multi-step lookup flows] -> Keep selector-oriented methods that return refs so callers can compose lookup and dereference clearly. +- [Spec drift between `_internal.Dml` and higher-level wrappers] -> Make `daggerml.api` explicitly responsible for ergonomic lookup composition. + +## Migration Plan + +1. Narrow `Dml` and `dml_resolution` signatures and runtime validation. +2. Update payload shaping so DB-backed objects stop returning duplicate raw ids. +3. Update `daggerml.api` and tests to pass `Ref` directly instead of `.to` strings. +4. Update docs and generated help text to describe the new contract. + +Rollback is straightforward: restore the previous coercion paths and payload fields if downstream compatibility issues surface before release. + +## Open Questions + +- Which selector-returning helpers should remain on `_internal.Dml` versus be pushed entirely into `daggerml.api` convenience flows? +- Whether commit payloads should add a `ref` field or rely only on existing commit-typed fields returned from selector workflows. diff --git a/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/proposal.md b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/proposal.md new file mode 100644 index 0000000..99cf5df --- /dev/null +++ b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/proposal.md @@ -0,0 +1,28 @@ +## Why + +`Dml` currently mixes DB object identity across `Ref`, ref-like strings, and raw `id` fields. That makes method contracts hard to predict and pushes selector parsing into APIs that should only accept exact object handles. 
+ +## What Changes + +- **BREAKING** Require `Ref` objects for all `Dml` inputs that represent DB objects. +- **BREAKING** Stop accepting ref-like strings such as `"dag:..."`, `"node:..."`, and `"commit:..."` where the method contract is for an exact DB object. +- Keep string inputs for non-DB selectors and labels such as revisions, DAG names, node names, branches, tags, remote URIs, and `index_id` values. +- Narrow `Dml` read and mutation surfaces so lookup-oriented methods are selector-based and dereference/mutation methods are ref-based. +- Remove duplicated raw DB `id` fields from `Dml` payloads and return `Ref` objects as the canonical DB identity. + +## Capabilities + +### New Capabilities + +None. + +### Modified Capabilities + +- `unified-dml-surface`: tighten the shared `Dml` contract so DB object inputs and outputs are ref-based, while non-DB selectors remain string-based. +- `dml-resolution`: limit the shared resolution layer to selector-to-ref lookup flows and remove ref-like string coercion from exact-input APIs. + +## Impact + +- Affected code: `src/daggerml/_internal/dml.py`, `src/daggerml/_internal/dml_resolution.py`, `src/daggerml/api.py`, and callers/tests that pass `.to` strings instead of `Ref` objects. +- Affected APIs: shared internal `Dml`, high-level Python wrappers that delegate through `Dml`, and any generated CLI/help metadata derived from signatures. +- Affected payloads: DAG, node, and commit payload shapes that currently duplicate `id` and `ref` for DB-backed objects. 
diff --git a/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/specs/dml-resolution/spec.md b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/specs/dml-resolution/spec.md new file mode 100644 index 0000000..1f990de --- /dev/null +++ b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/specs/dml-resolution/spec.md @@ -0,0 +1,45 @@ +## MODIFIED Requirements + +### Requirement: DAG resolution returns canonical dag refs +The DML resolution layer SHALL accept DAG lookup inputs only as a DAG name combined with a revision selector, and it SHALL resolve the result to a canonical dag `Ref`. + +#### Scenario: Resolve a named dag from a revision +- **WHEN** a caller resolves a DAG name together with a commit-reachable revision selector +- **THEN** the resolution layer returns the dag `Ref` mapped to that name in the selected commit + +#### Scenario: Reject explicit dag ref coercion input +- **WHEN** a caller passes a plain `"dag:..."` string to a DAG lookup resolver +- **THEN** the resolution layer raises `DmlRepoError` instead of coercing that string into a `Ref` + +### Requirement: Node resolution accepts named lookups only +The DML resolution layer SHALL accept node lookup inputs only as node names resolved through DAG context or revision-reachable DAG discovery, and it SHALL return a canonical node `Ref`. 
+ +#### Scenario: Resolve a named node lookup +- **WHEN** a caller resolves a node name together with sufficient DAG context +- **THEN** the resolution layer returns the named node as a `Ref` + +#### Scenario: Reject node-id style selector coercion +- **WHEN** a caller passes a plain string that matches a canonical node-id style selector such as `node-literal:abc123` +- **THEN** the resolution layer raises `DmlRepoError` instead of interpreting that string as a node `Ref` + +### Requirement: Ambiguous node lookup requires dag disambiguation +The DML resolution layer MUST require an explicit DAG selector when a name-based node lookup cannot be resolved unambiguously from the available context, and it MUST fail with `DmlRepoError` instead of guessing. + +#### Scenario: Reject ambiguous named node lookup +- **WHEN** a caller resolves a node name without explicit DAG context and the available context does not identify a single DAG +- **THEN** the resolution layer raises `DmlRepoError` instructing the caller to provide DAG context + +#### Scenario: Allow unambiguous lookup without explicit dag selector +- **WHEN** a caller resolves a node name without a DAG selector and the available context identifies exactly one matching DAG +- **THEN** the resolution layer returns the matching node `Ref` + +### Requirement: DML delegates selector resolution to the shared resolution layer +The `dml.py` orchestration layer SHALL use shared helpers from `dml_resolution.py` only for selector-to-ref lookup flows and SHALL bypass that layer for workflows that already require exact `Ref` inputs. 
+ +#### Scenario: DML resolves a lookup selector +- **WHEN** DML code needs to resolve a DAG name or node name for a lookup workflow +- **THEN** it uses the shared resolution layer and consumes the returned `Ref` + +#### Scenario: DML bypasses resolution for exact ref input +- **WHEN** a `Dml` workflow already requires an exact `Ref` object +- **THEN** it validates that object directly instead of routing it through selector parsing helpers diff --git a/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/specs/unified-dml-surface/spec.md b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..896e307 --- /dev/null +++ b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/specs/unified-dml-surface/spec.md @@ -0,0 +1,44 @@ +## ADDED Requirements + +### Requirement: Shared `Dml` exact DB object contracts use `Ref` +The shared `Dml` surface SHALL require `Ref` objects for caller inputs that represent exact DB-backed objects, and it SHALL return `Ref` objects as the canonical identity for DB-backed objects in its payloads. 
+ +#### Scenario: Exact DAG access requires `Ref` +- **WHEN** a caller invokes a `Dml` method whose contract is to dereference an exact DAG object +- **THEN** the method requires a `Ref` +- **AND** it does not accept a plain `"dag:..."` string as a substitute + +#### Scenario: Exact node access requires `Ref` +- **WHEN** a caller invokes a `Dml` method whose contract is to dereference an exact node object +- **THEN** the method requires a `Ref` +- **AND** it does not accept a plain `"node:..."` string as a substitute + +#### Scenario: Non-DB selectors remain strings +- **WHEN** a caller provides a revision selector, DAG name, node name, branch, tag, remote URI, or `index_id` +- **THEN** the shared `Dml` surface continues to accept that value as a string + +#### Scenario: DB-backed payloads use ref identity +- **WHEN** a shared `Dml` payload includes the identity of a commit, DAG, node, or other DB-backed object +- **THEN** that identity is represented by `Ref` +- **AND** the payload does not duplicate the same DB identity as a separate raw `id` string + +## MODIFIED Requirements + +### Requirement: `Dml` is the only fuzzy-selector boundary +The shared `Dml` class SHALL accept fuzzy selector strings only for workflows whose contract is lookup or repository navigation, and it SHALL require exact `Ref` objects for workflows whose contract is direct dereference or mutation of DB-backed objects. 
+ +#### Scenario: Revision selector resolves inside Dml +- **WHEN** a caller passes a supported revision string such as `HEAD~1` to a shared `Dml` repository method +- **THEN** the `Dml` method resolves it through the fuzzy-resolution submodule and lower-level ops receive only exact values + +#### Scenario: DAG-name lookup resolves inside Dml +- **WHEN** a caller passes a DAG name to a shared `Dml` lookup workflow that documents name-based selection +- **THEN** the shared `Dml` method performs that selector resolution through the fuzzy-resolution submodule and lower-level ops do not parse that caller-facing form + +#### Scenario: Exact DB-object workflow rejects fuzzy string grammar +- **WHEN** a caller passes a ref-like string such as `dag:abc123`, `node-literal:abc123`, or `commit:abc123` to a shared `Dml` workflow whose contract is for an exact DB-backed object +- **THEN** the method fails rather than coercing that string into a `Ref` + +#### Scenario: Unsupported fuzzy grammar is rejected at Dml boundary +- **WHEN** a caller passes a selector form that is not documented by the redesigned CLI contracts +- **THEN** the shared `Dml` method fails rather than inventing additional grammar diff --git a/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/tasks.md b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/tasks.md new file mode 100644 index 0000000..d27b4d2 --- /dev/null +++ b/openspec/changes/archive/2026-05-19-strict-dml-ref-contract/tasks.md @@ -0,0 +1,16 @@ +## 1. Tighten core Dml contracts + +- [x] 1.1 Narrow `src/daggerml/_internal/dml.py` method signatures so exact DB-object workflows require `Ref` and selector workflows keep string inputs. +- [x] 1.2 Remove ref-like string coercion from exact-input paths and add direct namespace validation for exact `Ref` inputs. +- [x] 1.3 Update DML payload shaping so DB-backed objects expose `Ref` as canonical identity and stop duplicating raw `id` fields. + +## 2. 
Narrow shared resolution behavior + +- [x] 2.1 Update `src/daggerml/_internal/dml_resolution.py` so it resolves lookup selectors only and no longer treats ref-like strings as exact refs. +- [x] 2.2 Keep DAG-name and node-name lookup flows working with revision and DAG context while rejecting ambiguous or unsupported selector forms. + +## 3. Update callers and tests + +- [x] 3.1 Update `src/daggerml/api.py` to pass `Ref` objects directly into strict `Dml` methods instead of `.to` strings. +- [x] 3.2 Update contract and unit tests to use `Ref` inputs for exact DB-object workflows and to assert failures for ref-like string inputs. +- [x] 3.3 Refresh any affected docs or generated help expectations so the new contract is documented consistently. diff --git a/openspec/config.yaml b/openspec/config.yaml new file mode 100644 index 0000000..392946c --- /dev/null +++ b/openspec/config.yaml @@ -0,0 +1,20 @@ +schema: spec-driven + +# Project context (optional) +# This is shown to AI when creating artifacts. +# Add your tech stack, conventions, style guides, domain knowledge, etc. +# Example: +# context: | +# Tech stack: TypeScript, React, Node.js +# We use conventional commits +# Domain: e-commerce platform + +# Per-artifact rules (optional) +# Add custom rules for specific artifacts. +# Example: +# rules: +# proposal: +# - Keep proposals under 500 words +# - Always include a "Non-goals" section +# tasks: +# - Break tasks into chunks of max 2 hours diff --git a/openspec/spec-overview.md b/openspec/spec-overview.md new file mode 100644 index 0000000..e549697 --- /dev/null +++ b/openspec/spec-overview.md @@ -0,0 +1,34 @@ +# DaggerML Spec Overview + +Audience: maintainers and agents working with the repository's OpenSpec capability set. + +Use this file to see which documents currently own each high-level concept. It is a governance index for the spec suite, not product documentation. 
+ +## Authority Mapping + +When a concept is not listed here, treat it as unresolved rather than guessing from proximity or naming. + +| concept | authority | scope | +| --- | --- | --- | +| Public API behavior | `docs/reference/python-api.md`, `docs/concepts/dags-and-nodes.md`, `docs/reference/errors.md` | Public Python API semantics, default-runtime helpers, node-wrapper selection, DAG-call staging behavior, and user-visible API errors. | +| CLI behavior | `docs/reference/cli.md` | User-visible CLI commands, arguments, and CLI semantics. | +| Execution and runtime behavior | `docs/reference/configuration.md`, `docs/concepts/execution.md`, `docs/architecture/remote-protocol.md` | Runtime configuration, execution flow, adapter-boundary payloads, and execution lifecycle semantics. | +| Cache publication and cache identity | `docs/concepts/execution.md`, `docs/architecture/ops-layer.md`, `docs/architecture/remote-protocol.md` | Runtime cache publication behavior, argv-derived cache identity, and the remote records that preserve execution state. | +| Storage and object persistence | `docs/concepts/storage.md`, `docs/concepts/refs-and-namespaces.md`, `docs/architecture/storage-internals.md`, `docs/guides/store-and-load-external-data.md` | Storage model, reference handling, GC-adjacent storage behavior, and external data persistence semantics. | +| Commit and DAG semantics | `docs/concepts/commits-and-history.md`, `docs/concepts/dags-and-nodes.md`, `docs/architecture/ops-layer.md` | Commit objects, DAG model semantics, and the operation-layer responsibilities that create and read them. | +| Remote sync and protocol | `docs/concepts/remotes.md`, `docs/architecture/remote-protocol.md`, `docs/architecture/ops-layer.md` | Remote lifecycle, remote schemas, sync protocol semantics, and remote operations behavior. | +| Codec encoding and import/export behavior | `docs/concepts/codecs-and-values.md` | Codec registry behavior, encoding rules, and import/export semantics. 
| +| Contrib API surface | `docs/contrib/reference/python-api.md` | `daggerml.contrib.api` decorators, delayed actions, and execution helpers. | +| Contrib literal codecs and dataframe serialization | `docs/contrib/reference/s3-and-codecs.md` | Contrib-owned codec behavior and dataframe serialization semantics. | +| Contrib prebuilt funks | `docs/contrib/reference/python-api.md` | Contrib-owned prebuilt function contracts. | +| Contrib testing helpers | `docs/contrib/reference/python-api.md` | Testing helpers intended for author-code unit tests. | +| Contrib runtime lifecycle | `docs/contrib/concepts/runtime.md`, `docs/contrib/architecture/execution-flow.md`, `docs/contrib/architecture/supervisor-and-state.md` | Supervisor launch, executor start/poll/cleanup, `ExecutionState` transitions, adapter/executor pairing, and deployment-specific execution-graph behavior. | +| Contrib plugin packaging and discovery | `docs/contrib/reference/runtime-surfaces.md` | Adapter and executor registry contracts, plugin packaging, and discovery behavior. | +| Contrib runtime diagnostics and status surfaces | `docs/contrib/reference/runtime-surfaces.md` | Contrib runtime status and diagnostics APIs and registration visibility. | +| Contrib S3 utility behavior | `docs/contrib/reference/s3-and-codecs.md` | `S3Store`, S3 URI normalization, content-addressed S3 object helpers, JSON helpers, tar helpers, and extraction safety rules. | + +## Handoffs + +- Human-facing product docs live under `docs/`. +- Path-based pre-read requirements live in `DOC_MAP.md`. +- Change proposals, designs, and task lists live under `openspec/changes/`. 
diff --git a/openspec/specs/adapter-cli-resolution/spec.md b/openspec/specs/adapter-cli-resolution/spec.md new file mode 100644 index 0000000..5b953b2 --- /dev/null +++ b/openspec/specs/adapter-cli-resolution/spec.md @@ -0,0 +1,55 @@ +## ADDED Requirements + +### Requirement: Runtime adapter execution SHALL use only concrete command-line adapter identities +Any runnable that reaches adapter execution SHALL carry an adapter value that is directly command-line-callable from the runtime environment or is an explicit executable path. Runtime adapter execution SHALL NOT reinterpret that value as symbolic sugar, SHALL NOT consult the adapter registry to repair it, and SHALL NOT fall back to Python import-based `cli()` invocation. + +#### Scenario: Built-in local adapter executes as a concrete command +- **WHEN** runtime execution receives a runnable with `adapter = "dml-local-adapter"` +- **THEN** it invokes `dml-local-adapter` as a command-line program +- **AND** it does not re-resolve `dml-local-adapter` through the adapter registry + +#### Scenario: Plugin adapter executes without requiring a `dml-` prefix +- **WHEN** runtime execution receives a runnable with `adapter = "podman-adapter"` +- **THEN** it treats `podman-adapter` as a valid concrete adapter command if callable from the runtime environment +- **AND** it does not require the adapter string to start with `dml-` + +#### Scenario: Explicit executable path is accepted +- **WHEN** runtime execution receives a runnable with `adapter = "/opt/acme/bin/build-adapter"` +- **THEN** it invokes that path directly as the adapter command + +#### Scenario: Test adapter path must itself be executable +- **WHEN** a test or fixture passes a filesystem path such as `tests/assets/internal_fn/python-fork-adapter.py` as `runnable.adapter` +- **THEN** that file is expected to be directly executable by the runtime, including any required executable permission bits +- **AND** runtime execution does not repair a non-executable 
adapter path through Python import fallback + +#### Scenario: Missing concrete adapter command fails closed +- **WHEN** runtime execution receives a runnable with a concrete adapter command that is not callable from the runtime environment +- **THEN** execution fails with an adapter-not-found error +- **AND** the runtime does not attempt Python import-based recovery + +### Requirement: Symbolic adapter names SHALL resolve before runtime execution +Author-facing APIs MAY accept symbolic adapter names as sugar, but the adapter registry and runnable-resolution flow SHALL resolve that sugar to a concrete command-line adapter identity before runtime execution begins. + +#### Scenario: Built-in sugar resolves to the canonical local adapter command +- **WHEN** author-facing code specifies `adapter = "local"` +- **THEN** runnable resolution produces a runtime runnable with `adapter = "dml-local-adapter"` + +#### Scenario: Built-in sugar resolves to the canonical lambda adapter command +- **WHEN** author-facing code specifies `adapter = "lambda"` +- **THEN** runnable resolution produces a runtime runnable with `adapter = "dml-lambda-adapter"` + +#### Scenario: Plugin-defined sugar resolves to a non-`dml-` adapter command +- **WHEN** a plugin registers symbolic adapter sugar `gpu` +- **THEN** runnable resolution MAY produce a runtime runnable with `adapter = "podman-adapter"` +- **AND** runtime execution treats that resolved command as canonical for that runnable + +### Requirement: Builtin execution SHALL use an explicit empty-adapter exception +The only runtime execution path that MAY accept `adapter = ""` is the explicit builtin-function branch where the runtime detects builtin execution directly and does not shell out to an adapter process. 
+ +#### Scenario: Builtin function bypasses adapter execution +- **WHEN** runtime execution handles a builtin function such as `get` or `concat` +- **THEN** it uses the builtin execution branch instead of spawning an adapter process + +#### Scenario: Empty adapter is not accepted for non-builtin execution +- **WHEN** a non-builtin runnable reaches adapter execution with `adapter = ""` +- **THEN** execution fails instead of treating the empty string as a command-line adapter identity diff --git a/openspec/specs/admin-cli-controls/spec.md b/openspec/specs/admin-cli-controls/spec.md new file mode 100644 index 0000000..69e9391 --- /dev/null +++ b/openspec/specs/admin-cli-controls/spec.md @@ -0,0 +1,77 @@ +### Requirement: Administrative CLI flows are grouped under `dml admin` +Low-frequency maintenance and recovery commands SHALL be exposed under `dml admin` rather than as top-level porcelain commands. + +#### Scenario: Admin help groups maintenance commands +- **WHEN** a user inspects `dml admin` help +- **THEN** index management, cache invalidation, remote discovery, remote garbage collection, and local garbage collection appear under `dml admin` + +### Requirement: Admin index list returns indexes with commit info +`dml admin index list` SHALL return every live index together with commit information for the commit each index currently points to. + +#### Scenario: Index list includes commit summaries +- **WHEN** a user runs `dml admin index list` +- **THEN** the command returns JSON with an `indexes` field +- **AND** each index entry includes its identifier and commit information for the pointed-to commit + +### Requirement: Admin index get returns full index inspection payload +`dml admin index get <index_id>` SHALL return index inspection data including commit information for the commit the index points to, rather than only a commit identifier.
+ +#### Scenario: Index get includes commit details +- **WHEN** a user runs `dml admin index get idx1` +- **THEN** the command returns JSON with an `index` object +- **AND** that object includes commit metadata for the pointed-to commit + +### Requirement: Admin index delete removes an index +`dml admin index delete <index_id>` SHALL delete the selected index and report the deletion result as JSON. + +#### Scenario: Index delete reports success +- **WHEN** a user runs `dml admin index delete idx1` +- **THEN** the command returns JSON containing `index` and `deleted` + +### Requirement: Admin cache invalidation accepts exact cache keys only +`dml admin cache invalidate <cache_key> [more cache keys]` SHALL accept one or more exact cache keys and SHALL NOT accept DAG refs, argv refs, or other selector types. + +#### Scenario: Cache invalidation accepts multiple exact keys +- **WHEN** a user runs `dml admin cache invalidate ck1 ck2` +- **THEN** the command invalidates those exact cache keys +- **AND** returns JSON containing `cache_keys` and `invalidated` + +#### Scenario: Cache invalidation rejects non-key selector forms +- **WHEN** a user runs `dml admin cache invalidate dag:abc123` +- **THEN** the command fails because admin cache invalidation accepts exact cache keys only + +### Requirement: Admin remote list can list projects or one project's refs +`dml admin remote list` SHALL support two modes through one command shape. + +Without a project argument, it SHALL list remote projects as canonical `dml://<owner>/<project>` URIs and MAY filter by owner. With a `dml://<owner>/<project>` argument, it SHALL list the remote branches and tags for that project.
+ +#### Scenario: Remote list returns projects +- **WHEN** a user runs `dml admin remote list` +- **THEN** the command returns JSON with a `projects` field containing canonical project URIs + +#### Scenario: Remote list filters by owner +- **WHEN** a user runs `dml admin remote list --owner alice` +- **THEN** the command returns only projects owned by `alice` + +#### Scenario: Remote list returns project refs +- **WHEN** a user runs `dml admin remote list dml://alice/demo` +- **THEN** the command returns JSON containing `project`, `branches`, and `tags` + +### Requirement: Admin remote GC performs remote maintenance +`dml admin remote gc` SHALL perform remote maintenance for the configured remote, including remote GC of CAS/refs state and remote transport cleanup, and SHALL report the result as JSON. + +#### Scenario: Remote GC reports cleanup summary +- **WHEN** a user runs `dml admin remote gc` +- **THEN** the command returns JSON summarizing deleted remote refs, CAS objects, and transport objects + +### Requirement: Admin local GC supports dry-run inspection +`dml admin gc` SHALL garbage-collect unreachable local objects. When `--dry-run` is provided, it SHALL report what would be deleted without deleting it. 
+ +#### Scenario: Local GC deletes unreachable objects +- **WHEN** a user runs `dml admin gc` +- **THEN** the command returns JSON describing deleted local objects + +#### Scenario: Local GC dry run reports orphans +- **WHEN** a user runs `dml admin gc --dry-run` +- **THEN** the command returns JSON containing `dry_run`, `would_delete`, and `orphans` +- **AND** the command does not delete local objects diff --git a/openspec/specs/cast-free-authoring-and-tests/spec.md b/openspec/specs/cast-free-authoring-and-tests/spec.md new file mode 100644 index 0000000..1d311e6 --- /dev/null +++ b/openspec/specs/cast-free-authoring-and-tests/spec.md @@ -0,0 +1,17 @@ +### Requirement: Contrib authoring helpers SHALL not use `cast(..., Any)` no-ops +The system SHALL preserve the current `api.dagclass`, `api.run`, and `api.funkify` behavior without using `cast(..., Any)` in their implementation. + +#### Scenario: Dagclass decoration still works after cast removal +- **WHEN** `api.dagclass` decorates and runs a class that previously passed through `cast(..., Any)` sites +- **THEN** the existing decoration, compilation, and runtime behavior remain unchanged + +#### Scenario: Funkify and dag staging still work after cast removal +- **WHEN** contrib runnable values are staged through the existing `api.funkify` and DAG execution flow +- **THEN** the same runtime results are produced without routing those values through `cast(..., Any)` + +### Requirement: Tests SHALL not use `cast(..., Any)` no-ops +The test suite SHALL validate contrib and configuration behavior without using `cast(..., Any)` to pass values through unchanged. 
+ +#### Scenario: Invalid funkify input remains rejected without `Any` casts +- **WHEN** test coverage passes a concrete invalid input to `api.funkify` +- **THEN** the API still raises the existing invalid-input repository error diff --git a/openspec/specs/cli-thin-interface/spec.md b/openspec/specs/cli-thin-interface/spec.md new file mode 100644 index 0000000..ea246bd --- /dev/null +++ b/openspec/specs/cli-thin-interface/spec.md @@ -0,0 +1,85 @@ +### Requirement: CLI handlers are transport-only +The CLI command layer SHALL be limited to discovering command shape from the public `Dml` surface, parsing command inputs, invoking domain interfaces, and serializing outputs, and SHALL NOT contain business workflow or domain decision logic. + +#### Scenario: CLI parses and delegates +- **WHEN** a user invokes any CLI command +- **THEN** the handler parses flags and arguments, calls a domain entrypoint, and formats the returned result without domain branching in the CLI layer + +#### Scenario: Generated command discovery remains transport-only +- **WHEN** the CLI inspects `Dml` signatures, annotations, and docstrings to build commands +- **THEN** that inspection is used only to derive transport behavior and not to re-implement domain workflow rules in the CLI layer + +### Requirement: One generated CLI module owns the public transport surface +The `dml` CLI SHALL be implemented through a single generated transport module rather than a package of hand-maintained per-command parser modules. + +#### Scenario: Public CLI entrypoint resolves through one module +- **WHEN** the `dml` script entrypoint is loaded +- **THEN** it imports one CLI module that generates and dispatches the public command surface + +### Requirement: Generated CLI command exposure follows the public `Dml` surface +The CLI SHALL expose the public CLI-generatable `Dml` surface directly rather than maintaining a smaller curated command subset. 
+ +#### Scenario: Runtime workflows become CLI-visible when generatable +- **WHEN** a public `dml.runtime` method uses only CLI-generatable parameter types +- **THEN** the generated CLI exposes that runtime workflow as a command + +#### Scenario: JSON output is uniform across generated commands +- **WHEN** any generated CLI command succeeds or fails +- **THEN** the CLI emits JSON rather than mixing JSON and plain-text command modes + +### Requirement: Domain logic resides outside CLI modules +Any behavior that determines domain outcomes (state transitions, merge/reconcile rules, execution sequencing, or validation beyond input shape/type checks) MUST execute in API/internal modules rather than `src/daggerml/_cli/**`. + +#### Scenario: Decision logic extraction +- **WHEN** a command path requires branching based on repository or execution state +- **THEN** the branching logic executes in a non-CLI module and CLI code only forwards parsed inputs and surfaces returned outcomes + +### Requirement: CLI output contract remains stable through documented compatibility changes +Refactoring to enforce a thin CLI boundary MUST preserve documented user-visible command semantics, including success output structure and failure signaling, except where a change explicitly defines a breaking CLI compatibility update. 
+ +#### Scenario: Refactor preserves behavior outside documented breaks +- **WHEN** CLI logic is moved into domain modules for commands whose public contract is unchanged by an approved change +- **THEN** command outputs and exit outcomes remain equivalent for existing supported invocations + +#### Scenario: Approved CLI redesign may replace old command contracts +- **WHEN** an approved change explicitly redefines the public CLI grammar and JSON payloads +- **THEN** the implementation MAY remove prior command names and prior output payload shapes for the affected commands + +### Requirement: CLI tests focus on interface behavior +CLI-focused tests SHALL validate input parsing, delegation wiring, output serialization, and exit signaling, while domain behavior assertions SHALL be covered in non-CLI test suites. + +#### Scenario: Test responsibility split +- **WHEN** adding or updating tests for a refactored command +- **THEN** CLI tests assert transport concerns only and domain behavior checks appear in API/internal tests + +### Requirement: CLI project and remote override flags use canonical config-shaped names +The CLI SHALL expose explicit project and remote override flags using the canonical configuration naming represented by the shared resolver, rather than frontend-specific aliases. 
+ +#### Scenario: Top-level project override uses canonical name +- **WHEN** a user passes an explicit project directory to any command +- **THEN** the CLI accepts `--project-home <path>` as the top-level override flag +- **AND** the CLI does not advertise `--repo` as the supported flag name + +#### Scenario: Top-level remote override uses canonical name +- **WHEN** a user passes an explicit remote project root to any command +- **THEN** the CLI accepts `--remote-root <uri>` as the top-level override flag +- **AND** the CLI does not advertise `--remote-uri` as the supported flag name + +### Requirement: CLI guidance uses canonical flag names consistently +CLI help text, examples, and normalized user-facing recovery hints SHALL use the same canonical flag names as the parser surface. + +#### Scenario: Help examples show canonical overrides +- **WHEN** a user opens top-level or subcommand help for commands that mention explicit config overrides +- **THEN** the examples and help text refer to `--project-home` and `--remote-root` instead of legacy aliases + +#### Scenario: Missing project-home hint uses canonical flag name +- **WHEN** command execution fails because no local project path can be resolved +- **THEN** the structured error hint instructs the user to pass `--project-home PATH` or set `DML_PROJECT_HOME` + +### Requirement: Shared public flag names do not create ambiguous CLI behavior +When the CLI uses the same canonical public flag spelling in different parser scopes, command dispatch SHALL preserve the intended meaning for each command path. 
+ +#### Scenario: Init keeps its own remote-root input without shadowing the top-level override +- **WHEN** the CLI exposes both a top-level `--remote-root` option and `init --remote-root` +- **THEN** parsing and command execution keep those inputs distinguishable +- **AND** `init` continues to forward its own `--remote-root` value to bootstrap project remote configuration diff --git a/openspec/specs/codec-normalization/spec.md b/openspec/specs/codec-normalization/spec.md new file mode 100644 index 0000000..def0d9e --- /dev/null +++ b/openspec/specs/codec-normalization/spec.md @@ -0,0 +1,65 @@ +## ADDED Requirements + +### Requirement: Codec logic has a single owning module +The system SHALL define `src/daggerml/codecs.py` as the only module that contains codec logic, codec types, codec registry behavior, plugin loading behavior, and built-in codec implementations. + +#### Scenario: Internal callers import codec behavior from the unified module +- **WHEN** internal staging code needs codec registration or codec application behavior +- **THEN** it imports that behavior from `daggerml.codecs` +- **AND** `daggerml._internal.*` does not define codec logic of its own + +#### Scenario: Built-in codecs live in the unified module +- **WHEN** the system provides built-in codec behavior for `Node` values or delayed-action values +- **THEN** those codec implementations are defined in `daggerml.codecs` + +### Requirement: Stage 1 preserves current codec call semantics +During Stage 1, the system SHALL continue to invoke codec behavior from internal staging call sites using `CodecContext`, while sourcing that behavior from `daggerml.codecs`. 
+ +#### Scenario: Literal staging still applies codecs through internal call sites +- **WHEN** `_internal` literal staging normalizes a value during Stage 1 +- **THEN** it applies codecs through `daggerml.codecs` +- **AND** it passes `CodecContext` to codec `encode(...)` + +#### Scenario: Function staging still applies codecs through internal call sites +- **WHEN** `_internal` function staging normalizes argv or kwargv values during Stage 1 +- **THEN** it applies codecs through `daggerml.codecs` +- **AND** it passes `CodecContext` to codec `encode(...)` + +#### Scenario: Codec-local failures are translated at the internal boundary +- **WHEN** codec application fails during Stage 1 +- **THEN** `daggerml.codecs` raises a codec-local error type +- **AND** the `_internal` caller translates that failure into the repository-domain error surface it already exposes + +### Requirement: Stage 2 codecs receive Dag instances +During Stage 2, the codec plugin contract SHALL pass `daggerml.api.Dag` into codec `encode(...)` instead of `CodecContext`. + +#### Scenario: Built-in codec receives Dag +- **WHEN** a built-in codec encodes a value during Stage 2 +- **THEN** its `encode(...)` method receives the active `Dag` instance + +#### Scenario: Plugin codec receives Dag +- **WHEN** a plugin codec loaded from the `daggerml.codecs` entry-point group encodes a value during Stage 2 +- **THEN** its `encode(...)` method receives the active `Dag` instance + +### Requirement: Dag owns recursive codec normalization in Stage 2 +During Stage 2, `daggerml.api.Dag` SHALL own recursive codec normalization and insertion for values accepted by public staging and call-entry methods. 
+ +#### Scenario: Dag.put normalizes recursively before runtime staging +- **WHEN** `Dag.put(value)` is called during Stage 2 +- **THEN** `Dag` recursively applies codecs and normalizes nested values before delegating to runtime literal staging + +#### Scenario: Dag.call inserts callable and arguments before execution +- **WHEN** `Dag.call(fn, *args, **kwargs)` is called during Stage 2 +- **THEN** `Dag` inserts the callable, positional arguments, and keyword argument values through the codec-driven normalization path before invoking runtime function staging + +#### Scenario: Node remains a codec during Dag-owned normalization +- **WHEN** a `Node` value is encountered during Stage 2 normalization +- **THEN** the system handles it through the built-in `Node` codec rather than through a special non-codec rule + +### Requirement: Codec plugins remain discoverable through the existing entry-point group +The system SHALL continue to load codec plugins from the `daggerml.codecs` entry-point group across both migration stages. + +#### Scenario: Entry-point group remains stable +- **WHEN** codec plugins are discovered after this change +- **THEN** discovery uses the `daggerml.codecs` entry-point group +- **AND** plugin loading preserves deterministic ordering and re-encode behavior diff --git a/openspec/specs/dml-resolution/spec.md b/openspec/specs/dml-resolution/spec.md new file mode 100644 index 0000000..3310e76 --- /dev/null +++ b/openspec/specs/dml-resolution/spec.md @@ -0,0 +1,56 @@ +## ADDED Requirements + +### Requirement: Revision resolution returns canonical commit refs +The DML resolution layer SHALL accept supported revision selectors, including direct commit refs, commit ids, `HEAD` ancestry selectors, branch names, and supported `dml://` revision URIs, and SHALL resolve them to a canonical commit `Ref`. 
+ +#### Scenario: Resolve a symbolic revision selector +- **WHEN** a caller resolves a supported symbolic revision selector such as `HEAD`, `HEAD~1`, a branch name, or a supported `dml://` URI +- **THEN** the resolution layer returns the corresponding commit `Ref` + +#### Scenario: Reject an invalid revision selector +- **WHEN** a caller resolves a revision selector that is empty, malformed, or points to an unsupported object namespace +- **THEN** the resolution layer raises `DmlRepoError` + +### Requirement: DAG resolution returns canonical dag refs +The DML resolution layer SHALL accept DAG lookup inputs only as a DAG name combined with a revision selector, and it SHALL resolve the result to a canonical dag `Ref`. + +#### Scenario: Resolve a named dag from a revision +- **WHEN** a caller resolves a DAG name together with a commit-reachable revision selector +- **THEN** the resolution layer returns the dag `Ref` mapped to that name in the selected commit + +#### Scenario: Reject explicit dag ref coercion input +- **WHEN** a caller passes a plain `"dag:..."` string to a DAG lookup resolver +- **THEN** the resolution layer raises `DmlRepoError` instead of coercing that string into a `Ref` + +### Requirement: Node resolution accepts named lookups only +The DML resolution layer SHALL accept node lookup inputs only as node names resolved through DAG context or revision-reachable DAG discovery, and it SHALL return a canonical node `Ref`. 
+ +#### Scenario: Resolve a named node lookup +- **WHEN** a caller resolves a node name together with sufficient DAG context +- **THEN** the resolution layer returns the named node as a `Ref` + +#### Scenario: Reject node-id style selector coercion +- **WHEN** a caller passes a plain string that matches a canonical node-id style selector such as `node-literal:abc123` +- **THEN** the resolution layer raises `DmlRepoError` instead of interpreting that string as a node `Ref` + +### Requirement: Ambiguous node lookup requires dag disambiguation +The DML resolution layer MUST require an explicit DAG selector when a name-based node lookup cannot be resolved unambiguously from the available context, and it MUST fail with `DmlRepoError` instead of guessing. + +#### Scenario: Reject ambiguous named node lookup +- **WHEN** a caller resolves a node name without explicit DAG context and the available context does not identify a single DAG +- **THEN** the resolution layer raises `DmlRepoError` instructing the caller to provide DAG context + +#### Scenario: Allow unambiguous lookup without explicit dag selector +- **WHEN** a caller resolves a node name without a DAG selector and the available context identifies exactly one matching DAG +- **THEN** the resolution layer returns the matching node `Ref` + +### Requirement: DML delegates selector resolution to the shared resolution layer +The `dml.py` orchestration layer SHALL use shared helpers from `dml_resolution.py` only for selector-to-ref lookup flows and SHALL bypass that layer for workflows that already require exact `Ref` inputs. 
+ +#### Scenario: DML resolves a lookup selector +- **WHEN** DML code needs to resolve a DAG name or node name for a lookup workflow +- **THEN** it uses the shared resolution layer and consumes the returned `Ref` + +#### Scenario: DML bypasses resolution for exact ref input +- **WHEN** a `Dml` workflow already requires an exact `Ref` object +- **THEN** it validates that object directly instead of routing it through selector parsing helpers diff --git a/openspec/specs/dmlops-init-recovery/spec.md b/openspec/specs/dmlops-init-recovery/spec.md new file mode 100644 index 0000000..0e9cccb --- /dev/null +++ b/openspec/specs/dmlops-init-recovery/spec.md @@ -0,0 +1,22 @@ +## Purpose +Define the config-first repository bootstrap recovery behavior that the shared internal `Dml` entrypoint must preserve when project config exists but local DB state is missing. + +## Requirements + +### Requirement: Init recovers missing DB when project config already exists +The system SHALL treat `.dml/config.toml` + missing `.dml/db/` as a recoverable initialization state through the shared internal `Dml` bootstrap workflow. + +#### Scenario: Existing config with missing DB is recovered +- **WHEN** the `Dml` init/bootstrap workflow runs in a project where `.dml/config.toml` exists and `.dml/db/` does not +- **THEN** initialization uses `dml_context` to resolve bootstrap context, creates `.dml/db/`, and completes without requiring manual repository repair + +### Requirement: Recovery mode pulls when a project URI is configured +The system SHALL fetch and check out project bootstrap state during recovery only when resolved configuration includes `remote.project`. 
+ +#### Scenario: Recovery fetches project state when project URI is present +- **WHEN** the `Dml` init/bootstrap workflow recovers a missing DB and resolved config includes `remote.project` +- **THEN** it uses resolved remote and project configuration to fetch project state and check out the fetched revision locally + +#### Scenario: Recovery skips fetch and checkout when project URI is absent +- **WHEN** the `Dml` init/bootstrap workflow recovers a missing DB and resolved config has no `remote.project` +- **THEN** it creates local DB state without invoking project fetch, pull, or checkout diff --git a/openspec/specs/execution-admin-controls/spec.md b/openspec/specs/execution-admin-controls/spec.md new file mode 100644 index 0000000..7868aa0 --- /dev/null +++ b/openspec/specs/execution-admin-controls/spec.md @@ -0,0 +1,208 @@ +### Requirement: Manual invalidation SHALL target execution identity +The system SHALL treat cache invalidation as an execution-graph operation. When a user requests invalidation for a cache key, the system SHALL resolve the current execution id from `refs/cache/<cache_key>.json`, compute the reverse caller closure over execution dependencies in a local planning database, and invalidate that execution set. + +The invalidation algorithm SHALL operate as follows: + +1. Initialize `seen = []`, `seen_set = set()`, and `unseen = set()`. +2. For each user-provided cache key, read `refs/cache/<cache_key>.json`. +3. If that cache ref exists, add its `execution_id` to `unseen`. +4. While `unseen` is not empty: +5. Remove one `exec_id` from `unseen`. +6. Read `exec/state/<exec_id>.json`; if it does not exist, continue. +7. Read `cache_key` from that execution state object. +8. Read `refs/cache/<cache_key>.json`; if it does not exist, continue. +9. If that cache ref points to a different `execution_id`, continue. +10. Append `exec_id` to `seen` and add it to `seen_set`. +11. Read callers of `exec_id` from `exec/edges/<exec_id>/`. +12. Add `(callers - seen_set)` to `unseen`. +13. 
After `unseen` is empty, iterate `exec_id` through `reversed(seen)`. +14. For each `exec_id`, write `exec/invalidate/<exec_id>.json` with create-once/CAS semantics. +15. Then delete `refs/cache/<cache_key>.json` with compare-and-swap semantics only if it still points to `exec_id`. + +#### Scenario: Invalidate starts from current cache ref +- **WHEN** a user invalidates cache key `ck1` +- **THEN** the system SHALL read `refs/cache/ck1.json` to determine the current root execution id before planning propagation + +#### Scenario: Historical execution is skipped when cache ref moved +- **WHEN** `exec/state/e1.json` exists but `refs/cache/ck1.json` now points to `e2` instead of `e1` +- **THEN** invalidation SHALL skip `e1` +- **AND** it SHALL NOT add callers of `e1` to the invalidation closure + +### Requirement: Invalidation SHALL write execution tombstones and drop affected cache refs +For every execution id in the invalidation closure, the system SHALL write `exec/invalidate/<execution_id>.json` as an immutable control marker containing `execution_id`, `cache_key`, `requested_by`, and `requested_at`. After planning completes, the system SHALL delete every cache ref whose recorded `execution_id` is in that invalidated set. 
+ +The invalidate tombstone schema SHALL be: + +- `execution_id: str` +- `cache_key: str` +- `requested_by: str` +- `requested_at: int` + +#### Scenario: Invalidation writes control markers and removes cache pointers +- **WHEN** the local planner computes invalidation closure `A` +- **THEN** the system SHALL create `exec/invalidate/<execution_id>.json` for every execution in `A` +- **AND** it SHALL delete each `refs/cache/<cache_key>.json` whose stored `execution_id` belongs to `A` + +#### Scenario: Cache ref delete is guarded by compare-and-swap +- **WHEN** invalidation reaches commit for execution `e1` +- **AND** `refs/cache/ck1.json` no longer points to `e1` +- **THEN** the system SHALL NOT delete that cache ref + +#### Scenario: Invalidation tombstone stores requester metadata +- **WHEN** the system writes `exec/invalidate/e1.json` +- **THEN** that object SHALL contain `execution_id`, `cache_key`, `requested_by`, and `requested_at` + +### Requirement: Manual cancellation SHALL target index identity +The system SHALL treat cancellation as an index-rooted execution-graph operation keyed by index id. `Dml.runtime.cancel` SHALL lock the target index, atomically move `indexes/<index_id>.json` to `indexes/.cancelled/<index_id>.json`, mark the synthetic index execution record `cancel-requested`, traverse the full rooted execution graph, and run a retryable cancellation loop over the rooted candidate executions. + +The cancellation algorithm SHALL operate as follows: + +1. Lock the target index. +2. Move `indexes/<index_id>.json` to `indexes/.cancelled/<index_id>.json` atomically. +3. Release the index lock. +4. Ensure `exec/state/<index_id>.json` exists as the synthetic root state record. +5. Update `exec/state/<index_id>.json` with compare-and-swap semantics so that `status = "cancel-requested"` and `cancel_requested_by` identifies the requesting user before any descendant cancellation work begins. +6. Traverse the full execution graph rooted at the synthetic root record's `dependencies` and collect `graph := {(caller, callee), ...}`. +7. 
Define `candidate_set := {callee | (caller, callee) in graph}` and `own_executions := candidate_set.copy()`. +8. While `candidate_set` is not empty, `Dml.runtime.cancel` SHALL run a parallel worker across the current `candidate_set` and log loop diagnostics. +9. For each candidate execution id, attempt to acquire the candidate's `cache_key` lock; if lock acquisition fails, return `None` for that candidate. +10. While holding the lock, reread `exec/state/<execution_id>.json`; if it does not exist, release the lock and return `-1`. +11. While holding the lock, read `active_callers(c)` for the candidate from the global reverse-edge records in S3 and determine current `status`. +12. If `len(active_callers(c) - own_executions) > 0` or the candidate is not in an active status, release the lock and return `-1`. +13. Otherwise, update `exec/state/<execution_id>.json` with compare-and-swap semantics so that `status = "cancel-requested"` and `cancel_requested_by` identifies the requesting user before invoking that candidate's adapter update path with `execution_status = "cancel-requested"`. +14. If the candidate's full adapter chain reaches terminal `cancelled`, update `exec/state/<execution_id>.json` so that `status = "cancelled"`, release the lock, and return `+1`. +15. Otherwise, release the lock and return `None`. +16. After one loop iteration completes, remove every `+1` candidate from `candidate_set` and remove every `-1` candidate from both `candidate_set` and `own_executions`. +17. Repeat until `candidate_set` is empty. +18. After every execution remaining in `own_executions` has status `cancelled`, update `exec/state/<index_id>.json` so that `status = "cancelled"`. +19. After the rooted graph has been cancelled successfully for the index-owned executions, delete `indexes/.cancelled/<index_id>.json`. +20. `Dml.runtime.cancel` SHALL return a cancellation statistics object. 
+ +`Dml.runtime.cancel` MAY continue looping indefinitely when a candidate execution can only be fully cancelled by adapters or runtimes that are unreachable from the index runtime process. In that case, the runtime SHALL continue persisting and observing `cancel-requested` state but is not required to guarantee autonomous completion. + +The cancellation statistics object SHALL have the following schema: + +- `index_id: str` +- `iterations: int` +- `graph_edges: int` +- `candidate_count: int` +- `own_execution_count: int` +- `cancelled_count: int` +- `dropped_count: int` +- `lock_retry_count: int` + +#### Scenario: Runtime cancel freezes the index before planning +- **WHEN** a user cancels index `idx1` +- **THEN** the system SHALL atomically move `indexes/idx1.json` to `indexes/.cancelled/idx1.json` under lock before cancellation planning begins + +#### Scenario: Rooted cancellation starts from the index root dependencies +- **WHEN** a user cancels index `idx1` +- **THEN** the runtime SHALL update `exec/state/idx1.json` so that `status = "cancel-requested"` +- **AND** it SHALL initialize rooted graph traversal from `exec/state/idx1.json` dependencies rather than from `{idx1}` itself + +#### Scenario: Root cancellation is recorded before descendant work +- **WHEN** a user cancels index `idx1` +- **THEN** the runtime SHALL persist `exec/state/idx1.json` with `status = "cancel-requested"` before counting callers for descendants or invoking any adapter cancellation updates + +#### Scenario: Cancellation discovers the full rooted graph before processing +- **WHEN** a user cancels index `idx1` +- **THEN** the runtime SHALL traverse the full execution graph reachable from `exec/state/idx1.json` dependencies +- **AND** it SHALL collect caller-callee edges for the full rooted graph before processing cancellation decisions for candidate executions + +#### Scenario: Candidate and ownership sets are initialized from rooted traversal +- **WHEN** rooted graph traversal for index 
`idx1` produces caller-callee graph `G` +- **THEN** the runtime SHALL derive `candidate_set` from the callee nodes in `G` +- **AND** it SHALL initialize `own_executions` as a copy of `candidate_set` + +#### Scenario: Caller ownership uses the global reverse-edge set +- **WHEN** index `A` and unrelated index `B` both call execution `X` +- **THEN** `callers(X)` SHALL include both `A` and `B` +- **AND** `cancel(A)` SHALL NOT cancel `X` + +#### Scenario: Recursive ownership remains cancellable +- **WHEN** index `A` calls `X` and `Y` +- **AND** `Y` calls `X` +- **AND** the global caller set is `callers(X) = {A, Y}` +- **THEN** `cancel(A)` MAY cancel `X` + +#### Scenario: Candidate lock contention yields retry +- **WHEN** the loop examines candidate execution `e1` +- **AND** `e1`'s cache-key lock cannot be acquired +- **THEN** the worker SHALL return `None` +- **AND** the loop SHALL leave `e1` in `candidate_set` for retry + +#### Scenario: Cancellation loop reports diagnostics +- **WHEN** `Dml.runtime.cancel` runs one or more cancellation loop iterations +- **THEN** it SHALL emit diagnostics describing loop progress + +#### Scenario: Cancellation returns loop statistics +- **WHEN** `Dml.runtime.cancel` completes for index `idx1` +- **THEN** it SHALL return cancellation statistics +- **AND** those statistics SHALL include the number of loop iterations + +#### Scenario: Cancellation statistics report rooted graph size +- **WHEN** rooted graph traversal for `idx1` collects 7 caller-callee edges and 4 candidate executions +- **THEN** the returned statistics SHALL include `graph_edges = 7` +- **AND** they SHALL include `candidate_count = 4` + +#### Scenario: Cancellation statistics report loop outcomes +- **WHEN** one cancellation run for `idx1` cancels 2 executions, drops 1 execution from ownership, and retries 3 lock-contention events +- **THEN** the returned statistics SHALL include `cancelled_count = 2` +- **AND** they SHALL include `dropped_count = 1` +- **AND** they 
SHALL include `lock_retry_count = 3` + +#### Scenario: Cancellation statistics identify the target index +- **WHEN** `Dml.runtime.cancel` completes for index `idx1` +- **THEN** the returned statistics SHALL include `index_id = "idx1"` + +#### Scenario: Candidate cancellation runs only without active callers +- **WHEN** the planner examines candidate execution `e1` +- **AND** `e1` still has at least one active caller outside `own_executions` +- **THEN** the runtime SHALL NOT mark `exec/state/e1.json` as `cancel-requested` +- **AND** it SHALL NOT invoke adapter cancellation for `e1` +- **AND** it SHALL NOT mark `exec/state/e1.json` as `cancelled` +- **AND** it SHALL remove `e1` from both `candidate_set` and `own_executions` for the current cancellation run + +#### Scenario: Candidate cancel request is recorded before cancellation work +- **WHEN** the planner examines candidate execution `e1` +- **AND** `e1` has no active callers outside `own_executions` +- **THEN** the runtime SHALL persist `exec/state/e1.json` with `status = "cancel-requested"` before invoking adapter cancellation for `e1` + +#### Scenario: Active callers are rechecked under lock +- **WHEN** execution `e1` is in the current `candidate_set` +- **AND** the runtime has acquired `e1`'s cache-key lock for the current loop iteration +- **THEN** it SHALL recompute `e1`'s active-caller set before marking `cancel-requested` or invoking adapter cancellation + +#### Scenario: Terminal cancelled waits for the full adapter chain +- **WHEN** execution `e1` has no active callers outside `own_executions` +- **AND** one adapter layer reports cancellation progress before outer adapter cleanup has finished +- **THEN** the runtime SHALL keep `exec/state/e1.json` at `status = "cancel-requested"` +- **AND** the index-cancellation runtime SHALL NOT persist `status = "cancelled"` until the full adapter chain has completed cancellation handling + +#### Scenario: Unreachable remote-only adapter chain can stall cancellation 
+- **WHEN** execution `e1` delegates cancellation work to descendant execution `e2` +- **AND** completing cancellation for `e2` requires bespoke adapters or a runtime unreachable from the index runtime process +- **THEN** `Dml.runtime.cancel` MAY continue retrying without converging to terminal `cancelled` +- **AND** it SHALL keep the relevant execution records at `status = "cancel-requested"` until another runtime handles cancellation or the user interrupts the loop + +#### Scenario: Cancelled execution still does not prune graph traversal +- **WHEN** execution `e1` is already `cancelled` +- **AND** `e1` has recorded dependencies +- **THEN** the runtime SHALL NOT invoke adapter cancellation for `e1` +- **AND** it SHALL still include `e1`'s descendants in rooted graph traversal for index cancellation + +#### Scenario: Successful cancellation removes only the candidate from the retry set +- **WHEN** the loop worker for execution `e1` returns `+1` +- **THEN** the runtime SHALL remove `e1` from `candidate_set` +- **AND** it SHALL keep `e1` in `own_executions` + +#### Scenario: Failed ownership removes candidate from both sets +- **WHEN** the loop worker for execution `e1` returns `-1` +- **THEN** the runtime SHALL remove `e1` from `candidate_set` +- **AND** it SHALL remove `e1` from `own_executions` + +#### Scenario: Cancellation sweep marks the synthetic root cancelled after graph completion +- **WHEN** the runtime completes the retry loop for index `idx1` +- **AND** every execution remaining in `own_executions` has status `cancelled` +- **THEN** it SHALL update `exec/state/idx1.json` so that `status = "cancelled"` +- **AND** it SHALL delete `indexes/.cancelled/idx1.json` diff --git a/openspec/specs/execution-call-edges/spec.md b/openspec/specs/execution-call-edges/spec.md new file mode 100644 index 0000000..713f22b --- /dev/null +++ b/openspec/specs/execution-call-edges/spec.md @@ -0,0 +1,51 @@ +### Requirement: Call-edge records SHALL represent realized rooted 
dependencies +The runtime SHALL record only realized rooted dependencies. An edge SHALL mean that caller id `caller_execution_id` was observed to depend on callee execution `callee_execution_id` during runtime execution, even if that dependency is discovered during a later `start_fn` poll cycle. The caller id MAY be either a normal execution id or a synthetic root index id. + +#### Scenario: Dependency discovered after initial launch still creates edge +- **WHEN** execution `e0` does not know about callee `e1` on its first poll but discovers that dependency on a later poll +- **THEN** the runtime SHALL create the edge record for `e1 <- e0` when that dependency becomes known + +#### Scenario: Repeated observation does not require a second edge fact +- **WHEN** execution `e0` rediscovers an existing dependency on `e1` +- **THEN** the runtime SHALL continue to treat `e1 <- e0` as one canonical edge fact + +#### Scenario: Index root creates rooted dependency edge +- **WHEN** index `idx1` starts execution `e1` +- **THEN** the runtime SHALL treat `e1 <- idx1` as one canonical rooted edge fact + +### Requirement: Runtime SHALL persist canonical edge records by callee execution id +The runtime SHALL persist each rooted dependency as the immutable object `exec/edges/<callee_execution_id>/<caller_execution_id>.json`. The payload SHALL include only `caller_execution_id` and `callee_execution_id`. 
+ +#### Scenario: Edge record is written at canonical path +- **WHEN** execution `e0` discovers a dependency on execution `e1` +- **THEN** the runtime SHALL write `exec/edges/e1/e0.json` +- **AND** that object SHALL contain JSON with `caller_execution_id = "e0"` and `callee_execution_id = "e1"` + +#### Scenario: Reverse lineage query lists callers by callee execution id +- **WHEN** an invalidation planner needs all callers of execution `e1` +- **THEN** it SHALL obtain them by reading the objects under `exec/edges/e1/` + +#### Scenario: Index root uses the same canonical edge namespace +- **WHEN** index `idx1` starts execution `e1` +- **THEN** the runtime SHALL write `exec/edges/e1/idx1.json` +- **AND** that object SHALL contain JSON with `caller_execution_id = "idx1"` and `callee_execution_id = "e1"` + +### Requirement: Live caller edges SHALL be caller-owned and removable +The runtime SHALL treat `exec/edges/<callee_execution_id>/<caller_execution_id>.json` as a live caller edge owned by the caller runtime. The caller runtime that created the edge SHALL be allowed to remove that edge when it cancels or otherwise stops being a caller of the callee execution. + +#### Scenario: Caller cancellation removes its own live edge +- **WHEN** caller execution `e0` is cancelled after creating edge `exec/edges/e1/e0.json` +- **THEN** the runtime handling `e0` cancellation SHALL be allowed to remove that edge + +#### Scenario: Other callers preserve callee liveness +- **WHEN** caller `e0` removes its edge to callee `e1` +- **AND** another live edge for `e1` still exists +- **THEN** the runtime SHALL continue to treat `e1` as having live callers + +### Requirement: Live caller edges and spawned execution ids SHALL remain distinct +The runtime SHALL use live caller edges for reverse-lineage invalidation and orphan detection, and SHALL use `execution_record.spawned_execution_ids` for cancellation traversal. Removal of a live caller edge SHALL NOT remove the callee from the caller's historical spawned execution summary. 
+ +#### Scenario: Removing live edge preserves historical cancellation dependency +- **WHEN** caller `e0` removes its live edge to callee `e1` during cancellation +- **THEN** `e1` MAY still remain in `e0`'s `spawned_execution_ids` +- **AND** the runtime SHALL continue treating those structures as distinct sources of truth diff --git a/openspec/specs/execution-state/spec.md b/openspec/specs/execution-state/spec.md new file mode 100644 index 0000000..c7dc440 --- /dev/null +++ b/openspec/specs/execution-state/spec.md @@ -0,0 +1,115 @@ +### Requirement: S3-backed mutex lock file +The system SHALL store a lock file at `{remote_root_prefix}/exec/{cache_key}.json` containing only `{lock_token: str, lock_expires_ts: float}`. No status, metadata, or job-specific fields. + +#### Scenario: Lock file written to correct S3 key +- **WHEN** `ExecutionState(cache_key, remote_root="s3://bucket/prefix").lock()` succeeds +- **THEN** a JSON object is written to `s3://bucket/prefix/exec/{cache_key}.json` + +#### Scenario: No DynamoDB dependency +- **WHEN** any `ExecutionState` method is called +- **THEN** no DynamoDB client is created and `DML_DYNAMODB_TABLE` is not read + +### Requirement: Lock acquired via create-if-absent +The system SHALL acquire the lock by PUT with `If-None-Match: *`. If the object already exists and its `lock_expires_ts` has not passed, `lock()` SHALL return `False`. If the existing lock is expired, the system SHALL DELETE it and re-PUT, returning `True`. 
+ +#### Scenario: Lock acquired when no file exists +- **WHEN** `lock()` is called and no lock file exists at the key +- **THEN** the file is created with a fresh `lock_token` and `lock_expires_ts`, and `True` is returned + +#### Scenario: Lock refused when held and not expired +- **WHEN** `lock()` is called and a non-expired lock file exists +- **THEN** `False` is returned and the file is unchanged + +#### Scenario: Expired lock is stolen +- **WHEN** `lock()` is called and an expired lock file exists +- **THEN** the old file is deleted, a new one is created, and `True` is returned + +#### Scenario: Concurrent create conflict returns False +- **WHEN** the `If-None-Match: *` PUT returns `412 PreconditionFailed` +- **THEN** `False` is returned without raising + +### Requirement: Lock released via DELETE +The system SHALL release the lock by DELETE of the lock file. No updates to the file are ever made. + +#### Scenario: Unlock deletes the file +- **WHEN** `unlock()` is called by the lock holder +- **THEN** the lock file is deleted from S3 + +#### Scenario: Unlock is idempotent +- **WHEN** `unlock()` is called and the file does not exist +- **THEN** no error is raised + +### Requirement: start_fn mutex-gated adapter dispatch +`IndexOps.start_fn` SHALL implement the following flow on every call: +1. Check cache — return node if hit. +2. Attempt `lock()` — return `None` if failed. +3. Recheck cache — if hit, delete lock file and return node. +4. Call adapter (must return quickly); adapter stdout carries `{status, dag_id?, error?}`. +5. On `succeeded`: publish result to cache, delete lock file. +6. On `failed`: delete lock file, raise. +7. On `running`: delete lock file, return `None`. 
+ +#### Scenario: Cache hit before lock returns node immediately +- **WHEN** `start_fn` is called and the cache already contains a result +- **THEN** the node is returned without acquiring the lock + +#### Scenario: Lock contention returns None +- **WHEN** `start_fn` is called and another process holds the lock +- **THEN** `None` is returned so the caller retries + +#### Scenario: Cache hit after lock cleans up and returns node +- **WHEN** `start_fn` acquires the lock but finds a cache hit on recheck +- **THEN** the lock file is deleted and the cached node is returned + +#### Scenario: Adapter success publishes cache and releases lock +- **WHEN** the adapter returns `status: succeeded` with a `dag_id` +- **THEN** the result is published to cache and the lock file is deleted + +#### Scenario: Adapter failure releases lock and raises +- **WHEN** the adapter returns `status: failed` +- **THEN** the lock file is deleted and a `DmlRepoError` is raised + +#### Scenario: Adapter still running releases lock and returns None +- **WHEN** the adapter returns `status: running` +- **THEN** the lock file is deleted and `None` is returned + +### Requirement: ExecutionState constructed from remote_root +The system SHALL accept `remote_root: str` as a required configuration parameter for `ExecutionState`. Call sites that construct `ExecutionState` MUST provide a valid remote root explicitly and MUST NOT rely on optional remote-root values or `None` defaults. 
+ +#### Scenario: remote_root parsed to bucket and prefix +- **WHEN** `ExecutionState(cache_key, remote_root="s3://my-bucket/my/prefix")` is constructed +- **THEN** lock operations target `s3://my-bucket/my/prefix/exec/{cache_key}.json` + +#### Scenario: call site provides explicit remote_root +- **WHEN** code constructs `ExecutionState` for a remote-backed execution flow +- **THEN** that call site passes a concrete `remote_root: str` value at construction time + +#### Scenario: optional or None remote_root defaults are not relied on +- **WHEN** a remote-backed execution flow constructs `ExecutionState` +- **THEN** it does not rely on an optional remote-root parameter or a `None` default to supply remote configuration + +### Requirement: Caller-owned launch state SHALL be serialized by cache-key lock +The runtime SHALL persist caller-owned `launch_state` for each execution attempt separately from lifecycle state. `launch_state` SHALL contain `execution_id`, `cache_key`, `resume_state`, and `created_at`. The runtime SHALL create and update `launch_state` only while holding the coordination lock for the corresponding `cache_key`. 
+ +#### Scenario: First running launch persists launch state under lock +- **WHEN** `start_fn` launches a new execution and receives a `running` adapter result with durable resume data +- **THEN** it SHALL persist `launch_state` containing `execution_id`, `cache_key`, `resume_state`, and `created_at` +- **AND** it SHALL do so while holding the lock for that `cache_key` + +#### Scenario: Resume reads launch state under lock +- **WHEN** `start_fn` resumes an execution referenced by `active/<cache_key>` +- **THEN** it SHALL read that execution's `launch_state` while holding the lock for that `cache_key` +- **AND** it SHALL pass `resume_state` from `launch_state` to the adapter + +### Requirement: Cancellation orphaning SHALL remove current-execution ownership under lock +When cancellation leaves an execution with no remaining live callers, the runtime SHALL acquire the coordination lock for that execution's `cache_key`, recheck that no live callers remain, ensure the execution is not terminal, and remove `active/<cache_key>` before marking cancellation intent on lifecycle state. 
+ +#### Scenario: Orphaned callee loses active pointer before cancellation lifecycle update +- **WHEN** cancellation removes the last live caller edge for callee execution `e1` +- **THEN** the runtime SHALL lock the coordination key for `e1`'s `cache_key` +- **AND** it SHALL delete `active/<cache_key>` before setting the callee lifecycle to a `cancel-*` value + +#### Scenario: New caller relaunches after detached cancellation +- **WHEN** a later caller computes the same `cache_key` after the prior execution was cancellation-detached and `active/<cache_key>` is absent +- **THEN** the runtime SHALL treat the computation as having no current execution +- **AND** it SHALL create a fresh execution attempt instead of resuming the detached one diff --git a/openspec/specs/executor-cancellation/spec.md b/openspec/specs/executor-cancellation/spec.md new file mode 100644 index 0000000..8a4a8e7 --- /dev/null +++ b/openspec/specs/executor-cancellation/spec.md @@ -0,0 +1,36 @@ +### Requirement: Executors SHALL handle `cancel-pending` as an update step +When the runtime invokes an executor with `execution_status = "cancel-pending"`, the executor SHALL treat that invocation as a cancellation update rather than as a fresh launch. Executors that normally dispatch to `runnable.sub` during update SHALL continue to dispatch to `runnable.sub` once in cancellation mode before performing executor-owned cleanup. Executors that do not normally dispatch to `runnable.sub` during update SHALL cancel their own external resources directly. 
+ +#### Scenario: Update-dispatch executor forwards cancellation update +- **WHEN** an executor that normally calls `runnable.sub` on update receives `execution_status = "cancel-pending"` +- **THEN** it SHALL issue its normal update-time sub-dispatch once before executor-owned cleanup + +#### Scenario: Detached-work executor cancels backend directly +- **WHEN** an executor that does not normally call `runnable.sub` on update receives `execution_status = "cancel-pending"` +- **THEN** it SHALL cancel or tear down its own external work without invoking `runnable.sub` + +### Requirement: Executors SHALL tear down external resources during cancellation +Executor-owned cancellation SHALL tear down external resources and SHALL NOT mutate the persisted execution record `state`. Script execution SHALL terminate the supervisor-managed process tree and remove its work directory. Docker execution SHALL stop and remove the container and SHALL remove any temporary loaded image. Batch execution SHALL cancel or terminate the Batch job as appropriate and SHALL deregister the temporary job definition. CloudFormation execution SHALL initiate rollback or cancellation of the stack operation and return without waiting for the rollback to finish. SSH execution SHALL return the nested adapter's cancellation result and SHALL NOT create additional remote wrapper state. 
+ +#### Scenario: Batch cancellation tears down Batch resources +- **WHEN** the Batch executor receives `execution_status = "cancel-pending"` +- **THEN** it SHALL cancel or terminate the Batch job and deregister the temporary job definition + +#### Scenario: CloudFormation cancellation returns quickly with rollback context +- **WHEN** the CloudFormation executor receives `execution_status = "cancel-pending"` +- **THEN** it SHALL start rollback or cancellation of the stack operation +- **AND** it SHALL return promptly with enough stack context for the caller to identify the affected stack + +### Requirement: Successful cancel updates SHALL report `cancel-detached` +When an executor processes a cancel update without transport or runtime exceptions, it SHALL return `status = "cancel-detached"` even if backend cleanup or rollback continues asynchronously. The runtime cancellation workflow SHALL treat that result as confirmation that the cancel update was handled and ownership was detached rather than as a successful DAG execution result. + +#### Scenario: Cancel update reports success after teardown request +- **WHEN** an executor successfully processes a `cancel-pending` update +- **THEN** it SHALL return `status = "cancel-detached"` + +### Requirement: Executor cancellation SHALL honor detached completion semantics +Executors SHALL interpret `cancel-detached` as a control-plane completion signal rather than proof that backend cleanup has already finished. Executors that initiate asynchronous backend rollback or shutdown SHALL still return promptly once they have issued the required cancellation work. 
+ +#### Scenario: Asynchronous backend rollback still returns detached status +- **WHEN** an executor starts backend rollback or shutdown that continues asynchronously +- **THEN** it SHALL still return `status = "cancel-detached"` after issuing that work successfully diff --git a/openspec/specs/generated-dml-cli/spec.md b/openspec/specs/generated-dml-cli/spec.md new file mode 100644 index 0000000..fd7efa2 --- /dev/null +++ b/openspec/specs/generated-dml-cli/spec.md @@ -0,0 +1,71 @@ +### Requirement: CLI surface is generated from the public `Dml` API +The system SHALL generate the `dml` command tree from the public `Dml` class and its public namespaces rather than from a hand-maintained set of per-command parser modules. + +#### Scenario: Top-level public methods become commands +- **WHEN** a public callable exists on `Dml` and its parameters are CLI-generatable +- **THEN** the CLI exposes a top-level command for that method + +#### Scenario: Public namespaces become command groups +- **WHEN** a public namespace object is reachable from `Dml` +- **THEN** the CLI exposes that namespace as a subcommand group and exposes its public CLI-generatable methods as leaf commands + +### Requirement: CLI only exposes methods with generatable parameter types +The CLI SHALL omit any public `Dml` or namespace method whose parameter annotations cannot be generated from command-line input. 
+ +#### Scenario: Unsupported parameter type omits method +- **WHEN** a public method includes a parameter annotated with an unsupported type such as `Any` +- **THEN** the CLI does not expose that method + +#### Scenario: Supported typed method remains exposed +- **WHEN** a public method uses only supported parameter families such as `Ref`, `int`, `float`, `str`, `Literal`, optionals of those types, or JSON-backed container types +- **THEN** the CLI exposes that method + +### Requirement: Generated arguments follow signature-driven CLI rules +The CLI SHALL derive argument shape from runtime-visible signatures, defaults, and resolved annotations. + +#### Scenario: Required parameters become positional arguments +- **WHEN** a public method parameter has no default value +- **THEN** the generated CLI exposes it as a positional argument using the snake_case parameter name + +#### Scenario: Defaulted parameters become options +- **WHEN** a public method parameter has a default value +- **THEN** the generated CLI exposes it as an option using the kebab-case parameter name + +#### Scenario: Boolean defaults preserve behavior +- **WHEN** a boolean parameter default is `False` +- **THEN** the generated CLI exposes a positive `--<name>` flag +- **AND** when a boolean parameter default is `True` +- **THEN** the generated CLI exposes a negative `--no-<name>` flag + +### Requirement: Generated parsing uses annotations and documented help metadata +The CLI SHALL parse supported argument types from resolved annotations and SHALL use docstrings plus `Annotated` metadata to generate command help. 
+ +#### Scenario: Literal annotations constrain choices +- **WHEN** a parameter is annotated with `Literal[...]` +- **THEN** the generated CLI restricts accepted values to those literals + +#### Scenario: Ref annotations parse as refs +- **WHEN** a parameter is annotated as `Ref` +- **THEN** the generated CLI parses the input string into a `Ref` value before calling the method + +#### Scenario: Positional arguments are documented in help text +- **WHEN** a generated command includes positional arguments +- **THEN** the command help includes positional argument documentation derived from annotations or doc metadata rather than relying only on default `argparse` positional rendering + +### Requirement: Overload ambiguity uses one runtime signature +The CLI SHALL generate commands from one runtime-visible signature even when overload declarations describe multiple static variants. + +#### Scenario: Overloaded method still generates one command +- **WHEN** a public method has overload declarations and one implementation signature +- **THEN** the CLI uses the implementation signature for generation and does not create multiple command variants + +### Requirement: Generated CLI output and errors are JSON +The generated CLI SHALL emit JSON for successful results and normalized failures. 
+ +#### Scenario: Successful command emits JSON +- **WHEN** a generated CLI command returns a value +- **THEN** the CLI serializes that value as JSON using the standard typed-leaf encoder + +#### Scenario: Failed command emits structured JSON error +- **WHEN** generated command execution raises an exception +- **THEN** the CLI emits a structured JSON error payload instead of an unstructured traceback diff --git a/openspec/specs/git-like-commit-ops/spec.md b/openspec/specs/git-like-commit-ops/spec.md new file mode 100644 index 0000000..d4bdb37 --- /dev/null +++ b/openspec/specs/git-like-commit-ops/spec.md @@ -0,0 +1,230 @@ +## Purpose +Define the git-like repository workflow contracts for revision resolution, checkout, merge, revert, DAG checkout, and shared `Dml` orchestration over commit/head/remote subsystems. + +## Requirements + +### Requirement: Merge advances current head +The system SHALL merge another commit or branch into the current branch by creating a merge commit when needed and advancing the current head. + +#### Scenario: Merge non-conflicting branch +- **WHEN** a user merges a branch whose tree changes do not conflict with the current branch +- **THEN** the system creates a merge commit with both commits as parents and advances the current head to that merge commit + +#### Scenario: Merge fast-forward +- **WHEN** the current branch head is an ancestor of the merged commit +- **THEN** the system advances the current head to the merged commit without creating an unnecessary merge commit + +### Requirement: Merge detects DAG-name conflicts +The system SHALL reject merges where both sides changed the same DAG name to different DAG refs since the merge base. 
+ +#### Scenario: Conflicting DAG name +- **WHEN** the merge base has `train -> dag:a`, the current branch has `train -> dag:b`, and the merged branch has `train -> dag:c` +- **THEN** merge fails with a conflict naming `train` and does not advance the current head + +### Requirement: Revert commit creates inverse commit +The system SHALL revert a commit by applying the inverse of that commit's tree diff to the current branch as a new commit. + +A revert SHALL only modify a DAG name when the current tree still matches the post-commit value introduced by the reverted commit. If the current tree no longer matches that post-commit value, revert SHALL fail with a conflict and SHALL NOT advance the current branch. + +#### Scenario: Revert added DAG +- **WHEN** the reverted commit added DAG name `train` +- **THEN** the revert commit removes `train` from the current branch tree if safe to apply + +#### Scenario: Revert changed DAG +- **WHEN** the reverted commit changed `train` from `dag:a` to `dag:b` +- **THEN** the revert commit changes `train` back to `dag:a` if the current tree still permits safe application + +#### Scenario: Revert changed DAG conflict +- **WHEN** the reverted commit changed `train` from `dag:a` to `dag:b` and the current tree has `train -> dag:c` +- **THEN** revert fails with a conflict naming `train` and does not advance the current branch + +#### Scenario: Revert added DAG conflict +- **WHEN** the reverted commit added `train -> dag:a` and the current tree has `train -> dag:b` +- **THEN** revert fails with a conflict naming `train` and does not advance the current branch + +#### Scenario: Revert removed DAG conflict +- **WHEN** the reverted commit removed `train -> dag:a` and the current tree already has `train -> dag:b` +- **THEN** revert fails with a conflict naming `train` and does not advance the current branch + +### Requirement: DAG checkout from revision +The system SHALL support checking out one DAG from a resolved revision into the current 
branch tree and committing that change. + +#### Scenario: Checkout DAG with same name +- **WHEN** `dml dag checkout HEAD~1 train` resolves `HEAD~1` to a commit containing `train -> dag:a` +- **THEN** the system creates a new commit whose tree contains `train -> dag:a` and advances the current head + +#### Scenario: Checkout DAG with alias +- **WHEN** `dml dag checkout origin/main train --as baseline_train` resolves `origin/main` to a commit containing `train -> dag:a` +- **THEN** the system creates a new commit whose tree contains `baseline_train -> dag:a` and advances the current head + +#### Scenario: Checkout refuses overwrite by default +- **WHEN** the target name already exists with a different DAG ref and `--replace` is not provided +- **THEN** DAG checkout fails without creating a commit or advancing the current head + +#### Scenario: Checkout replaces when requested +- **WHEN** the target name already exists with a different DAG ref and `--replace` is provided +- **THEN** DAG checkout creates a new commit with the target name pointing to the checked-out DAG ref + +### Requirement: Revision resolution +The system SHALL resolve revision values used by git-like commands to concrete local commit refs without performing network fetches. `HEAD` and ancestry expressions based on `HEAD` SHALL resolve through the repository's `.dml/HEAD` file. 
+ +#### Scenario: Resolve branch shorthand +- **WHEN** a command receives `main` as a revision +- **THEN** the system resolves it as local branch `main` + +#### Scenario: Resolve remote-tracking branch shorthand +- **WHEN** a command receives `origin/main` as a revision +- **THEN** the system resolves it through the configured remote URI to local tracking ref `dml://<owner>/<repo>#main` + +#### Scenario: Resolve fetched DML branch URI +- **WHEN** a command receives `dml://alice/tools#main` as a revision and that tracking ref exists locally +- **THEN** the system resolves it to the commit stored for that tracking ref + +#### Scenario: Resolve fetched DML tag URI +- **WHEN** a command receives `dml://alice/tools@v1.0` as a revision and that tracking ref exists locally +- **THEN** the system resolves it to the commit stored for that tracking ref + +#### Scenario: Unfetched DML URI is not fetched implicitly +- **WHEN** a command receives `dml://alice/tools#main` as a revision and no matching local tracking ref exists +- **THEN** the command fails without contacting the remote + +#### Scenario: Resolve first-parent ancestry from HEAD file +- **WHEN** a command receives `HEAD~2` as a revision +- **THEN** the system resolves `HEAD` through `.dml/HEAD` and walks two first-parent steps from that resolved commit + +#### Scenario: Resolve local tag shorthand +- **WHEN** a command receives `v1.0` as a revision and `v1.0` resolves as a local tag +- **THEN** the system resolves it to the commit referenced by that tag + +### Requirement: Checkout repository state from revision +The system SHALL support checking out repository state from a resolved revision and SHALL distinguish branch-attached from detached checkouts by rewriting `.dml/HEAD`. 
+ +#### Scenario: Checkout branch attaches runtime +- **WHEN** `dml checkout main` resolves `main` to a local branch +- **THEN** the system writes `.dml/HEAD` as `ref: refs/local/heads/main` and reports branch-attached checkout + +#### Scenario: Checkout tag detaches runtime +- **WHEN** `dml checkout v1.0` resolves `v1.0` to a tag target commit +- **THEN** the system writes `.dml/HEAD` as that detached commit and reports detached checkout at that commit + +#### Scenario: Checkout commit expression detaches runtime +- **WHEN** `dml checkout HEAD~1` resolves to a concrete commit +- **THEN** the system writes `.dml/HEAD` as that detached commit and reports detached checkout at that commit + +#### Scenario: Commit while detached does not advance branch or HEAD +- **WHEN** a user checks out a non-branch revision and then runs commit flow through `IndexOps.commit` +- **THEN** the system may create the new detached commit but does not advance any branch head and does not rewrite `.dml/HEAD` + +### Requirement: Mutable project workflows require an attached branch +The system SHALL require `.dml/HEAD` to be attached to a local branch before default project workflows mutate branch history or publish a branch tip. 
+ +#### Scenario: Push uses attached HEAD branch by default +- **WHEN** `.dml/HEAD` is attached to local branch `foo` and the user runs project push without an explicit branch override +- **THEN** the system pushes local branch `foo` to remote branch URI `dml://<owner>/<repo>#foo` + +#### Scenario: Pull requires attached HEAD +- **WHEN** `.dml/HEAD` is detached and the user runs project pull without an explicit mutable branch target +- **THEN** the command fails instead of selecting a branch from config or environment + +#### Scenario: Merge requires attached HEAD when defaulting destination +- **WHEN** `.dml/HEAD` is detached and the user runs a merge workflow that would otherwise target the current branch +- **THEN** the command fails because the current checkout is not a mutable branch target + +#### Scenario: Checkout unresolved remote URI fails locally +- **WHEN** `dml checkout dml://alice/tools#main` is requested and no local tracking ref exists for that URI +- **THEN** checkout fails without implicit fetch and reports that the revision cannot be resolved locally + +### Requirement: DAG removal remains explicit +The system SHALL remove DAG names from the current branch tree only through an explicit DAG removal command, not through DAG checkout of an absent source. + +#### Scenario: Checkout absent DAG +- **WHEN** DAG checkout targets a commit that does not contain the requested DAG name +- **THEN** the command fails without deleting the target name from the current branch + +### Requirement: Branch-targeted commit workflows update branches through HeadOps +The system SHALL perform branch advancement in git-like commit workflows through `HeadOps` public methods rather than direct head storage access. 
+ +#### Scenario: Merge updates branch through HeadOps +- **WHEN** a branch-targeted merge needs to fast-forward or store a merge commit +- **THEN** the workflow advances the branch through `HeadOps` using the expected current commit and the new commit + +#### Scenario: Revert updates branch through HeadOps +- **WHEN** a branch-targeted revert creates a new commit +- **THEN** the workflow advances the branch through `HeadOps` rather than writing the head object directly + +#### Scenario: DAG checkout updates branch through HeadOps +- **WHEN** DAG checkout creates a new commit on a branch +- **THEN** the workflow advances the branch through `HeadOps` rather than writing the head object directly + +### Requirement: Repository inspection workflows resolve revisions locally +The system SHALL provide repository inspection workflows for `show`, `log`, and `diff` that resolve revisions locally without performing implicit network fetches. + +#### Scenario: Show resolves revision locally +- **WHEN** a user runs `dml show origin/main` +- **THEN** the system resolves `origin/main` through existing local tracking state +- **AND** it does not contact the remote automatically + +#### Scenario: Diff resolves both revisions locally +- **WHEN** a user runs `dml diff dml://alice/demo#main HEAD` +- **THEN** the system resolves both revisions from local state only + +### Requirement: Branch listing exposes remote-tracking branches +The system SHALL support listing locally tracked remote branches for git-like branch inspection. + +#### Scenario: Branch remote lists tracked refs +- **WHEN** a user runs `dml branch --remote` +- **THEN** the system returns the set of locally tracked remote branch selectors + +### Requirement: Repository status reports current DAG map and live indexes +The system SHALL provide a repository status workflow that reports the current HEAD state, local branches, the DAG map for the current revision, and live indexes. 
+ +#### Scenario: Status reports attached head +- **WHEN** HEAD is attached to branch `main` and a user runs `dml status` +- **THEN** the response reports attached head state for `main` +- **AND** includes the DAG map for the commit selected by that head + +#### Scenario: Status reports detached head +- **WHEN** HEAD is detached and a user runs `dml status` +- **THEN** the response reports detached head state and the current commit + +### Requirement: Show returns commit delta over DAG namespace +The system SHALL compute commit-introduced change for `dml show` as DAG-map additions, removals, and updates between the selected commit tree and its base tree. + +#### Scenario: Show detects DAG addition +- **WHEN** a commit introduces `train -> dag:a` where the base tree had no `train` +- **THEN** `dml show` reports `train` under `change.added` + +#### Scenario: Show detects DAG update +- **WHEN** a commit changes `train` from `dag:a` to `dag:b` +- **THEN** `dml show` reports `train` under `change.updated` with `before` and `after` + +#### Scenario: Show detects DAG removal +- **WHEN** a commit removes `train -> dag:a` +- **THEN** `dml show` reports `train` under `change.removed` + +### Requirement: Git-like project workflows are owned by `Dml` orchestration +Git-like project command workflows SHALL be available through the shared internal `Dml` orchestration boundary, which coordinates commit, head, and remote operations while delegating concrete repository actions to lower-level ops classes. 
+ +#### Scenario: Pull executes through Dml workflow +- **WHEN** a caller invokes project pull with remote target, branch target, and user context +- **THEN** `Dml` obtains project and remote context through `dml_context`, resolves any fuzzy selectors through its fuzzy-resolution submodule, performs remote synchronization, and applies merge behavior through internal ops + +#### Scenario: Push executes through Dml workflow +- **WHEN** a caller invokes project push with remote target and push options +- **THEN** `Dml` obtains project and remote context through `dml_context`, performs project-aware remote push behavior through the relevant ops classes, and returns the push result through the shared boundary + +#### Scenario: Revert executes through Dml workflow +- **WHEN** a caller invokes project revert with revision, branch target, and user context +- **THEN** `Dml` resolves the revision through its fuzzy-resolution submodule and performs revert behavior through `CommitOps` + +#### Scenario: Checkout executes through Dml workflow +- **WHEN** a caller invokes repository checkout with a revision value +- **THEN** `Dml` resolves the revision through its fuzzy-resolution submodule and performs attached-vs-detached checkout behavior through the relevant ops classes + +#### Scenario: Init runs through Dml-owned project setup +- **WHEN** a caller invokes repository init/bootstrap behavior +- **THEN** `Dml` initializes project state under `.dml/` in the current location through the shared internal boundary instead of requiring a separate bootstrap entrypoint + +#### Scenario: Init recovers config-first partial state +- **WHEN** `.dml/config.toml` exists but `.dml/db/` is missing at init time +- **THEN** the Dml-owned init workflow uses `dml_context` to resolve bootstrap context, creates the missing DB state, and continues bootstrap behavior through the relevant ops classes diff --git a/openspec/specs/headops-pointer-management/spec.md 
b/openspec/specs/headops-pointer-management/spec.md new file mode 100644 index 0000000..8d9a389 --- /dev/null +++ b/openspec/specs/headops-pointer-management/spec.md @@ -0,0 +1,93 @@ +## ADDED Requirements + +### Requirement: HeadOps owns branch and index pointer persistence +The system SHALL route all branch and index storage creation, lookup, update, listing, and deletion through `HeadOps` public methods. + +#### Scenario: Non-HeadOps caller needs branch commit +- **WHEN** an internal caller needs the commit for a branch +- **THEN** it obtains that commit through a `HeadOps` public method instead of reading a `Head` object or head ref directly + +#### Scenario: Non-HeadOps caller needs index commit +- **WHEN** an internal caller needs the commit for an index +- **THEN** it obtains that commit through a `HeadOps` public method instead of reading an `Index` object or index ref directly + +### Requirement: HeadOps hides head and index refs from callers +The system SHALL keep branch and index refs internal to `HeadOps` and SHALL expose branch names, opaque index ids, and commit refs to non-`HeadOps` callers. + +#### Scenario: Branch-targeted workflow uses branch name +- **WHEN** an internal caller targets a branch +- **THEN** the caller interacts with `HeadOps` using the branch name rather than a head ref + +#### Scenario: Index-targeted workflow uses opaque index id +- **WHEN** an internal caller targets an index +- **THEN** the caller interacts with `HeadOps` using an opaque index id rather than an index ref + +### Requirement: HeadOps supports atomic commit updates for pointers +The system SHALL update branch and index commits through `update_branch_commit` and `update_index_commit` methods that require the caller to provide the expected current commit. 
+ +#### Scenario: Expected commit matches +- **WHEN** a caller requests a branch or index commit update with the correct current commit +- **THEN** `HeadOps` stores the new commit atomically + +#### Scenario: Expected commit is stale +- **WHEN** a caller requests a branch or index commit update with an outdated current commit +- **THEN** `HeadOps` rejects the update and raises a dedicated conflict error + +### Requirement: Conflict error reports current commit for retries +The system SHALL raise a dedicated `DmlRepoError` subclass for stale branch/index updates, and that exception SHALL expose the correct `current_commit`. + +#### Scenario: Caller retries after stale index update +- **WHEN** `update_index_commit` fails because the stored commit changed +- **THEN** the raised conflict error includes the current stored commit for the caller to inspect and retry from + +### Requirement: HeadOps public methods support caller-owned transactions +The system SHALL keep transaction-aware behavior limited to `create_branch`, and all other public `HeadOps` pointer-management methods SHALL operate without caller-owned transactions. 
+ +#### Scenario: Caller provides transaction to create_branch +- **WHEN** a caller invokes `create_branch(..., txn=...)` +- **THEN** `HeadOps` uses that transaction only for bootstrap commit creation, closes it only if `HeadOps` opened it, and does not create the branch file until the transaction that created the commit has been closed successfully + +#### Scenario: Caller invokes non-bootstrap pointer method +- **WHEN** a caller invokes any `HeadOps` pointer lookup, listing, update, create-index, or delete-index method other than `create_branch` +- **THEN** the method performs only `.dml/refs/**` file I/O and stale-write checks without accepting or requiring a transaction or validating commit existence in LMDB + +#### Scenario: Index deletion remains plain HeadOps cleanup +- **WHEN** a caller asks `HeadOps` to delete an index ref +- **THEN** `HeadOps` removes the index file as a plain file operation and does not require compare-and-delete semantics + +### Requirement: HeadOps owns persisted checkout state +The system SHALL route `.dml/HEAD` creation, parsing, update, and commit resolution through `HeadOps` public methods rather than allowing callers to read or write the checkout-state file directly. + +#### Scenario: Non-HeadOps caller needs current checkout state +- **WHEN** an internal caller needs to know whether the repository is attached or detached +- **THEN** it obtains that state through a `HeadOps` public method instead of reading `.dml/HEAD` directly + +#### Scenario: Repository bootstrap creates attached HEAD +- **WHEN** repository initialization creates the initial local branch +- **THEN** `HeadOps` persists `.dml/HEAD` as `ref: refs/local/heads/<branch>` for that branch + +### Requirement: HeadOps persists HEAD using two plain-text forms only +The system SHALL persist `.dml/HEAD` using exactly one of two plain-text payload forms: `ref: refs/local/heads/<branch>` for attached mode or `commit:<id>` for detached mode. 
+ +#### Scenario: Attached HEAD is written +- **WHEN** a checkout operation attaches to local branch `feature` +- **THEN** `.dml/HEAD` contains exactly `ref: refs/local/heads/feature` + +#### Scenario: Detached HEAD is written +- **WHEN** a checkout operation detaches at commit `commit:abc123` +- **THEN** `.dml/HEAD` contains exactly `commit:abc123` + +#### Scenario: Invalid HEAD payload fails closed +- **WHEN** `.dml/HEAD` contains any other payload form +- **THEN** `HeadOps` rejects the repository state and does not guess an alternate checkout target + +### Requirement: HeadOps resolves HEAD to the active commit +The system SHALL resolve `.dml/HEAD` to a concrete commit ref by following the attached local branch ref or by returning the detached commit directly. + +#### Scenario: Attached HEAD resolves through local branch ref +- **WHEN** `.dml/HEAD` contains `ref: refs/local/heads/main` +- **THEN** `HeadOps` resolves HEAD to the commit stored at `.dml/refs/local/heads/main` + +#### Scenario: Detached HEAD resolves directly +- **WHEN** `.dml/HEAD` contains `commit:abc123` +- **THEN** `HeadOps` resolves HEAD to `commit:abc123` without consulting any branch ref diff --git a/openspec/specs/human-facing-project-docs/spec.md b/openspec/specs/human-facing-project-docs/spec.md new file mode 100644 index 0000000..09d09c4 --- /dev/null +++ b/openspec/specs/human-facing-project-docs/spec.md @@ -0,0 +1,60 @@ +### Requirement: `docs/` SHALL be reserved for human-facing project documentation +The repository SHALL treat `docs/` as the human-facing project documentation surface that describes DaggerML as it exists, while agent-facing change-planning artifacts remain in `openspec/` and maintainer workflow rules live outside `docs/`. 
+ +#### Scenario: Human reader enters the docs tree +- **WHEN** a reader opens `docs/` +- **THEN** the visible content describes the product, its usage, its concepts, or its architecture for humans rather than agent workflow or change-planning procedure + +#### Scenario: Agent-facing planning remains outside project docs +- **WHEN** a reader needs change proposals, implementation tasks, or requirement deltas for a change +- **THEN** those artifacts are found under `openspec/` rather than inside `docs/` + +### Requirement: Project docs SHALL be organized by reader intent +The `docs/` tree SHALL organize its primary navigation around reader intent with a docs home, one getting-started page, concept docs, guides, reference docs, architecture docs, and a contrib subtree using the same broad model. + +#### Scenario: Reader looks for onboarding +- **WHEN** a new reader wants the fastest path to first success +- **THEN** `docs/README.md` points to a single `docs/getting-started.md` page rather than a fragmented getting-started subtree + +#### Scenario: Reader looks for the right kind of information +- **WHEN** a reader wants a mental model, a workflow, an exact command surface, or an internal system explanation +- **THEN** the docs navigation distinguishes those needs through concepts, guides, reference, and architecture sections + +### Requirement: `getting-started` SHALL be one concise page +The project docs SHALL provide a single getting-started page that covers installation, first repository setup, first DAG creation, basic inspection, and next-step links without splitting those basics across multiple introductory files. 
+ +#### Scenario: Reader starts from zero +- **WHEN** a reader follows `docs/getting-started.md` +- **THEN** the page includes enough information to install DaggerML, create or select a repo, create a first DAG, and inspect it with at least one simple command or API example + +### Requirement: Human-facing docs SHALL avoid normative spec voice +Docs under `docs/` SHALL describe the system in reader-facing language and SHALL avoid structuring pages around authority ownership, compatibility classifications, or normative maintenance phrases such as document-level handoff rules. + +#### Scenario: Reader opens a topic doc +- **WHEN** a reader opens a concept, guide, reference, or architecture page under `docs/` +- **THEN** the document leads with explanation of the subject matter instead of an authority or governance preamble + +### Requirement: Existing technical content SHALL be preserved through translation, not path churn +When current docs are reorganized, the implementation SHALL preserve useful technical knowledge by rewriting and reclassifying existing material into concept, guide, reference, or architecture pages rather than merely renaming files or deleting depth. + +#### Scenario: Existing detailed doc is migrated +- **WHEN** a current technical document contains valuable behavioral or architectural explanation +- **THEN** the new docs structure preserves that information in an appropriate human-facing page even if the original path or tone changes + +### Requirement: Maintainer workflow docs SHALL leave `docs/` +Repository-maintenance documents such as edit pre-read maps, spec-governance indexes, and contributor test-taxonomy policy SHALL not remain in the human-facing `docs/` tree after the reorganization. 
+ +#### Scenario: Reader encounters maintainer guidance +- **WHEN** a maintainer needs edit workflow or contributor-policy guidance +- **THEN** that guidance is located in a maintainer-facing location outside `docs/` + +### Requirement: Docs rewrite tasks SHALL be independently assignable to repo-aware subagents +The reorganization plan SHALL divide implementation work into independent documentation tasks whose owners first inspect the existing repo, current docs, and relevant code for the area they are rewriting. + +#### Scenario: Subagent rewrites a docs area +- **WHEN** a subagent is assigned a docs subtree or topic lane +- **THEN** the task instructions require that subagent to read the current docs for that area and inspect the corresponding source modules before producing rewritten docs + +#### Scenario: Parallel docs work proceeds safely +- **WHEN** multiple subagents work on different doc lanes such as concepts, reference, architecture, or contrib +- **THEN** the task boundaries are specific enough that each subagent can make progress independently without redefining the whole docs architecture diff --git a/openspec/specs/indexops-optimistic-ref-publication/spec.md b/openspec/specs/indexops-optimistic-ref-publication/spec.md new file mode 100644 index 0000000..ef3e3cd --- /dev/null +++ b/openspec/specs/indexops-optimistic-ref-publication/spec.md @@ -0,0 +1,29 @@ +## ADDED Requirements + +### Requirement: IndexOps publishes index mutations through post-transaction compare-and-swap +The system SHALL have affected `IndexOps` mutation paths derive new commits in LMDB before publishing them through `HeadOps` compare-and-swap operations on file-backed index or branch refs. 
+ +#### Scenario: Index mutation publishes after LMDB commit +- **WHEN** an `IndexOps` mutation updates an existing index state +- **THEN** it reads the base commit through `HeadOps`, writes the new immutable commit in an LMDB write transaction, closes that transaction, and only then asks `HeadOps` to compare-and-swap the index ref to the new commit + +### Requirement: IndexOps retries from the current stored commit after stale ref conflicts +The system SHALL retry affected `IndexOps` mutation paths when `HeadOps` reports a stale pointer conflict, using the conflict's current stored commit as the next base commit. + +#### Scenario: Index compare-and-swap loses a race +- **WHEN** `HeadOps.update_index_commit` rejects an `IndexOps` publication attempt with `DmlPointerConflictError(current_commit=commit:new)` +- **THEN** `IndexOps` starts a fresh LMDB write transaction using `commit:new` as the base commit and rebuilds the mutation before retrying publication + +### Requirement: Branch-targeted index commits publish branch movement after commit creation +The system SHALL publish branch advancement for `IndexOps.commit(..., head=...)` only after the new commit has been durably created in LMDB. + +#### Scenario: Branch-backed commit finalization +- **WHEN** `IndexOps.commit` finalizes a working index onto a branch +- **THEN** it writes the new commit in LMDB, closes the LMDB transaction, and only then asks `HeadOps` to advance the branch from the expected old commit to the new commit + +### Requirement: Detached scratch commit helpers do not create temporary index refs +The system SHALL build builtin and failed-execution scratch commit state without publishing temporary index refs under `.dml/refs/local/indexes`. 
+ +#### Scenario: Builtin helper constructs scratch commit +- **WHEN** builtin execution needs a temporary DAG commit to materialize a result +- **THEN** the helper builds detached scratch commit state directly in LMDB and returns the resulting DAG/commit refs without creating or deleting a temporary index ref file diff --git a/openspec/specs/init-input-normalization/spec.md b/openspec/specs/init-input-normalization/spec.md new file mode 100644 index 0000000..8c4292b --- /dev/null +++ b/openspec/specs/init-input-normalization/spec.md @@ -0,0 +1,17 @@ +### Requirement: Init accepts optional remote capabilities +The init operation MUST accept optional `remote_project` and optional `remote_root` inputs. Init MUST allow both values to be omitted for local read-only repository bootstrap. + +#### Scenario: Init without remote configuration +- **WHEN** init is called with no `remote_project` and no `remote_root` +- **THEN** init succeeds without deriving or persisting project publication identity + +#### Scenario: Init with remote root only +- **WHEN** init is called with `remote_root` and no `remote_project` +- **THEN** init succeeds and configures remote-backed mutation and execution capability without project sync capability + +### Requirement: Init rejects project identity without remote root +The init operation MUST reject `remote_project` when `remote_root` is absent. 
+ +#### Scenario: Project URI without remote root +- **WHEN** init is called with `remote_project` and no `remote_root` +- **THEN** init fails with a descriptive validation error stating that `remote.root` is required when `remote.project` is configured diff --git a/openspec/specs/is-node-like-predicate/spec.md b/openspec/specs/is-node-like-predicate/spec.md new file mode 100644 index 0000000..cd5e61c --- /dev/null +++ b/openspec/specs/is-node-like-predicate/spec.md @@ -0,0 +1,41 @@ +### Requirement: is_node_like predicate exists in contrib API +The system SHALL provide a public function `is_node_like(x)` in `daggerml.contrib.api` that returns `True` if and only if `x` is an instance of `Node`, `DelayedRef`, `DelayedLoad`, or `DelayedRunnable`. + +#### Scenario: Node instance is node-like +- **WHEN** `is_node_like(x)` is called with a `Node` instance +- **THEN** it returns `True` + +#### Scenario: DelayedRef is node-like +- **WHEN** `is_node_like(x)` is called with a `DelayedRef` instance +- **THEN** it returns `True` + +#### Scenario: DelayedLoad is node-like +- **WHEN** `is_node_like(x)` is called with a `DelayedLoad` instance +- **THEN** it returns `True` + +#### Scenario: DelayedRunnable is node-like +- **WHEN** `is_node_like(x)` is called with a `DelayedRunnable` instance +- **THEN** it returns `True` + +#### Scenario: Plain value is not node-like +- **WHEN** `is_node_like(x)` is called with a plain Python value (str, int, list, None, etc.) +- **THEN** it returns `False` + +#### Scenario: DelayedActionCodec is not node-like +- **WHEN** `is_node_like(x)` is called with a `DelayedActionCodec` instance (the internal codec wrapper) +- **THEN** it returns `False` + +### Requirement: SshExecutor uses is_node_like for field validation +`SshExecutor._validate_kw` SHALL use `is_node_like` to accept node-like values for the `host` and `flags` fields instead of checking `isinstance(x, DelayedActionCodec)` directly. 
+ +#### Scenario: Node-like host passes validation +- **WHEN** `_validate_kw` is called with `host` set to a `Node`, `DelayedRef`, `DelayedLoad`, or `DelayedRunnable` +- **THEN** validation passes without error + +#### Scenario: Node-like flags passes validation +- **WHEN** `_validate_kw` is called with `flags` set to a `Node`, `DelayedRef`, `DelayedLoad`, or `DelayedRunnable` +- **THEN** validation passes without error + +#### Scenario: Invalid host still raises error +- **WHEN** `_validate_kw` is called with `host` set to an empty string or a non-node-like non-string +- **THEN** a `DmlRepoError` is raised diff --git a/openspec/specs/remote-project-refs/spec.md b/openspec/specs/remote-project-refs/spec.md new file mode 100644 index 0000000..3f952be --- /dev/null +++ b/openspec/specs/remote-project-refs/spec.md @@ -0,0 +1,336 @@ +## ADDED Requirements + +### Requirement: Remote project refs namespace +The system SHALL store project branch and tag refs under `refs/projects/<owner>/<project>/{heads,tags}/` within the remote protocol root. + +#### Scenario: Branch head path +- **WHEN** project `alice/demo` branch `main` is addressed on the remote +- **THEN** the branch head ref path is `refs/projects/alice/demo/heads/main.json` + +#### Scenario: Tag path +- **WHEN** project `alice/demo` tag `v1.0` is addressed on the remote +- **THEN** the tag ref path is `refs/projects/alice/demo/tags/v1.0.json` + +### Requirement: Branch heads are mutable and project tags are immutable +The system SHALL allow project branch head refs to move through safe update operations and SHALL reject attempts to overwrite existing project tag refs. 
+ +#### Scenario: Branch head update +- **WHEN** a push safely advances project `alice/demo` branch `main` +- **THEN** the existing `refs/projects/alice/demo/heads/main.json` ref may be replaced by the new branch head payload + +#### Scenario: Tag overwrite rejected +- **WHEN** `refs/projects/alice/demo/tags/v1.0.json` already exists +- **THEN** publishing tag `v1.0` fails without changing the existing tag ref + +### Requirement: Project refs use manifest ref payloads +The system SHALL encode project branch and tag refs using the existing remote ref payload schema for manifest refs. + +Project branch and tag refs SHALL point to commit manifests, SHALL include direct DAG `targets`, and SHALL fail before writing the ref if the target manifest is missing, invalid, or has `closure["dag"]` inconsistent with the ref `targets["dag"]`. + +#### Scenario: Project branch ref payload +- **WHEN** project `alice/demo` branch `main` is written +- **THEN** `refs/projects/alice/demo/heads/main.json` contains `kind`, `schema`, `target`, `created_at`, `targets`, and `meta` fields following the remote ref schema + +#### Scenario: Project tag ref payload +- **WHEN** project `alice/demo` tag `v1.0` is written +- **THEN** `refs/projects/alice/demo/tags/v1.0.json` contains `kind`, `schema`, `target`, `created_at`, `targets`, and `meta` fields following the remote ref schema + +#### Scenario: Project ref target validation fails closed +- **WHEN** a project branch or tag ref would point to a missing manifest, invalid manifest, non-commit manifest, or inconsistent direct DAG targets +- **THEN** the write fails without creating or updating the project ref + +### Requirement: Shared remote CAS +The system SHALL store immutable CAS objects in a shared remote CAS under `cas/sha256///` independent of owner, project, or branch. 
+ +#### Scenario: Two projects reference same object +- **WHEN** two project refs target manifests that include the same CAS object +- **THEN** the remote stores that CAS object at one shared CAS path + +### Requirement: Global DML config +The system SHALL load global DML config from `$DML_CONFIG_HOME/config.toml`, `$XDG_CONFIG_HOME/dml/config.toml`, or `~/.config/dml/config.toml` in that precedence order. + +#### Scenario: DML config home wins +- **WHEN** `DML_CONFIG_HOME` is set +- **THEN** the system reads global config from `$DML_CONFIG_HOME/config.toml` + +#### Scenario: XDG config home fallback +- **WHEN** `DML_CONFIG_HOME` is unset and `XDG_CONFIG_HOME` is set +- **THEN** the system reads global config from `$XDG_CONFIG_HOME/dml/config.toml` + +#### Scenario: Default config fallback +- **WHEN** neither `DML_CONFIG_HOME` nor `XDG_CONFIG_HOME` is set +- **THEN** the system reads global config from `~/.config/dml/config.toml` + +### Requirement: Global user defaults +The system SHALL use global config for user defaults and bootstrap hook configuration. + +#### Scenario: Default project owner +- **WHEN** global config contains `[user].name = "alice"` and `dml init demo` omits an owner +- **THEN** the project owner is `alice` + +#### Scenario: Default branch +- **WHEN** global config contains `[defaults].branch = "main"` and `dml init demo` omits a branch +- **THEN** the initial branch is `main` + +### Requirement: Local remote config +The system SHALL store project-local config under `.dml/config.toml` containing branchless project identity and remote storage settings. The current checkout branch MUST NOT be stored in local config. 
+ +#### Scenario: Resolve origin main +- **WHEN** local config defines project identity `dml://alice/demo` and the attached local branch is `main` +- **THEN** `dml push` resolves the default remote target as project owner `alice`, project `demo`, and branch `main` + +#### Scenario: Project fields are stored +- **WHEN** local project config is written for project `alice/demo` +- **THEN** `.dml/config.toml` contains `[project].uri = "dml://alice/demo"` and does not contain branch-selection fields + +#### Scenario: Remote fields are stored +- **WHEN** local project config records the remote storage URI for project `alice/demo` +- **THEN** `.dml/config.toml` contains the configured `[remote]` fields and no local checkout branch field + +#### Scenario: Reject branch-qualified local project URI +- **WHEN** local config would store `dml://alice/demo#main` or `dml://alice/demo@v1` +- **THEN** config validation fails without writing the selector-bearing URI + +### Requirement: Config waterfall precedence +The system SHALL resolve configurable values using explicit CLI/API arguments first, environment variables second, and config file values last. Checkout-state selection is not part of this waterfall and SHALL be resolved from `.dml/HEAD`. 
+ +#### Scenario: Explicit value wins over environment +- **WHEN** a command receives an explicit mutable branch argument and environment variables also provide configuration inputs +- **THEN** the command uses the explicit branch argument for that mutable branch target + +#### Scenario: Environment does not override checkout state +- **WHEN** a command omits an explicit branch argument and environment variables are resolved +- **THEN** the command still derives the current checkout from `.dml/HEAD` rather than from configuration environment variables + +#### Scenario: Config used as fallback for non-checkout values +- **WHEN** a command omits explicit overrides and no matching environment value is set +- **THEN** the command uses configured values such as `remote.project`, `remote.root`, or `default_branch` but not a config-derived current branch + +#### Scenario: Remote storage env vars override config +- **WHEN** `DML_REMOTE_BUCKET` or `DML_REMOTE_PREFIX` is set for a remote operation +- **THEN** the command uses the environment value instead of the configured remote storage field + +### Requirement: Supported DML environment variables +The system SHALL support only the DML environment variables defined for the project model and SHALL treat hook context variables as output-only process context. `DML_BRANCH` is not a supported environment variable. 
+ +#### Scenario: Global config home override +- **WHEN** `DML_CONFIG_HOME` is set +- **THEN** the global DML config directory resolves from `DML_CONFIG_HOME` + +#### Scenario: Existing user env remains supported +- **WHEN** `DML_USER` is set and an owner is omitted +- **THEN** the system uses `DML_USER` as the default project owner + +#### Scenario: DML_BRANCH is rejected as unsupported +- **WHEN** `DML_BRANCH` is set during project or runtime command resolution +- **THEN** the system does not use it as checkout state or branch selection input + +#### Scenario: Project env overrides config +- **WHEN** `DML_PROJECT_NAME`, `DML_PROJECT_OWNER`, or `DML_REMOTE_PROJECT` is set +- **THEN** the corresponding project config value is overridden for that command + +#### Scenario: Remote env overrides config +- **WHEN** `DML_REMOTE_ROOT`, `DML_REMOTE_BUCKET`, or `DML_REMOTE_PREFIX` is set +- **THEN** the corresponding remote selection or storage value is overridden for that command + +#### Scenario: Hook context env is provided by DML +- **WHEN** a hook command runs +- **THEN** DML sets `DML_HOOK`, `DML_PROJECT_HOME`, and, for clone hooks, `DML_REMOTE_NAME` + +### Requirement: Project commands use project-local state and current env names only +The system SHALL resolve project-local state from the project directory and SHALL use only the current supported environment variable surface for git-like project operations. 
+ +#### Scenario: Project config comes from the project directory +- **WHEN** a project command resolves project-local config +- **THEN** it reads from `<project-dir>/.dml/config.toml` + +#### Scenario: DML_REPO is not used for project database +- **WHEN** a project command opens the local object database +- **THEN** it uses `<project-dir>/.dml/db/` and does not use `DML_REPO` + +#### Scenario: DML_REMOTE_ROOT is not used for named remotes +- **WHEN** a remote project command resolves remote storage +- **THEN** it uses named remote bucket/prefix config or `DML_REMOTE_BUCKET` and `DML_REMOTE_PREFIX`, not `DML_REMOTE_ROOT` + +#### Scenario: Removed execution/cache env vars are ignored +- **WHEN** `DML_DYNAMODB_TABLE` or `DML_REMOTE_CACHE` is set during a git-like project operation +- **THEN** the operation does not use those values + +### Requirement: Project directory initialization +The system SHALL initialize local project state under `<project-dir>/.dml/` for `init`. + +#### Scenario: Init creates DML directory +- **WHEN** `dml init demo` succeeds +- **THEN** the system creates `demo/.dml/`, `demo/.dml/config.toml`, `demo/.dml/HEAD`, and local database storage under `demo/.dml/db/` + +#### Scenario: Init refuses existing child directory +- **WHEN** `dml init demo` runs and `demo/` already exists +- **THEN** init fails and instructs the user to initialize that directory with `dml init --here demo` + +#### Scenario: Init here creates DML directory in current directory +- **WHEN** `dml init --here demo` succeeds from the current directory +- **THEN** the system creates `.dml/`, `.dml/config.toml`, `.dml/HEAD`, and local database storage under `.dml/db/` + +#### Scenario: Init here uses provided project name +- **WHEN** `dml init --here demo` succeeds from directory `workdir` +- **THEN** the local project name is `demo` + +#### Scenario: Init creates DML gitignore +- **WHEN** `dml init demo` succeeds +- **THEN** the system writes `demo/.dml/.gitignore` containing `*` + +#### Scenario: Init creates initial 
branch and attaches HEAD +- **WHEN** `dml init demo` succeeds +- **THEN** local storage contains an initial empty commit/tree, local branch `main`, and `.dml/HEAD` attached to `main` + +### Requirement: Init shell hooks +The system SHALL support `post-init` shell hooks from global DML config that run in the project directory after `.dml/` exists. + +#### Scenario: Init hook succeeds +- **WHEN** a `post-init` hook command is configured and `dml init demo` runs +- **THEN** the hook command runs in the `demo` project directory after `demo/.dml/` exists + +#### Scenario: Init here hook succeeds +- **WHEN** a `post-init` hook command is configured and `dml init --here demo` runs +- **THEN** the hook command runs in the current directory after `.dml/` exists + +#### Scenario: Hooks run in configured order +- **WHEN** multiple `post-init` hook commands are configured and `dml init demo` runs +- **THEN** the hook commands run in their configured list order + +#### Scenario: Init no-hooks skips hooks +- **WHEN** `dml init --no-hooks demo` runs +- **THEN** no `post-init` hook commands run + +#### Scenario: Hook environment omits removed branch env +- **WHEN** a `post-init` hook command runs +- **THEN** the process environment includes `DML_HOOK`, `DML_PROJECT_HOME`, `DML_PROJECT_NAME`, `DML_PROJECT_OWNER`, and `DML_CONFIG_HOME`, and does not include `DML_BRANCH` + +### Requirement: DML URIs track fetched remote refs +The system SHALL track fetched remote branches and tags locally by canonical normalized DML URI. 
+ +#### Scenario: Store fetched branch tracking ref +- **WHEN** `dml fetch dml://alice/tools#main` succeeds +- **THEN** local storage tracks `dml://alice/tools#main` as pointing to the resolved commit + +#### Scenario: Store fetched tag tracking ref +- **WHEN** `dml fetch dml://alice/tools@v1.0` succeeds +- **THEN** local storage tracks `dml://alice/tools@v1.0` as pointing to the resolved commit + +#### Scenario: Tracking ref stores commit pointer +- **WHEN** a fetched remote ref is persisted locally +- **THEN** the persisted tracking ref contains the resolved commit pointer + +#### Scenario: Canonical URI head is stored +- **WHEN** a remote fetch resolves project `alice/tools` branch `main` +- **THEN** the local tracking ref is stored under canonical URI `dml://alice/tools#main` + +#### Scenario: Derived expression is not stored as URI head +- **WHEN** a remote operation resolves a derived expression such as `HEAD~2` +- **THEN** the system stores only the canonical project branch or tag URI for any tracking head it writes + +#### Scenario: URI tracking ref length is validated +- **WHEN** a command would create a tracking ref whose canonical DML URI exceeds 64 bytes +- **THEN** the command fails without writing the tracking ref + +#### Scenario: Overlong URI is rejected directly +- **WHEN** a canonical DML URI exceeds 64 bytes +- **THEN** the system rejects it and does not hash or rewrite it into an alternate tracking key + +#### Scenario: URI tracking ref characters are validated explicitly +- **WHEN** a command would create a DML URI tracking ref +- **THEN** the system validates the canonical URI as a DML project URI before writing the tracking ref + +#### Scenario: User-facing DML URI resolves to local tracking ref +- **WHEN** a user-facing command receives `dml://alice/tools#main` +- **THEN** the command resolves it locally through the tracking ref for `dml://alice/tools#main` + +### Requirement: Remote operations parse DML URIs +The system SHALL parse and 
canonicalize DML revision URIs through one centralized shared revision URI parser/stringifier boundary before deriving remote project ref paths. + +#### Scenario: Push parses branch URI through shared parser +- **WHEN** push targets canonical URI `dml://alice/demo#main` +- **THEN** remote operations derive `refs/projects/alice/demo/heads/main.json` from the shared parsed revision object + +#### Scenario: Fetch parses tag URI through shared parser +- **WHEN** fetch targets canonical URI `dml://alice/demo@v1.0` +- **THEN** remote operations derive `refs/projects/alice/demo/tags/v1.0.json` from the shared parsed revision object + +#### Scenario: Branch/tag capability checks remain operation-specific +- **WHEN** a mutation operation targets the wrong selector type (branch op with tag URI, or tag op with branch URI) +- **THEN** the operation fails at method boundary capability checks even though URI parsing/canonicalization succeeds + +### Requirement: Project creation owner default +The system SHALL default project owner to the configured current user when project creation omits an owner. + +#### Scenario: Create project without owner +- **WHEN** the configured user is `alice` and project `demo` is created without an explicit owner +- **THEN** the project URI is `dml://alice/demo` + +### Requirement: Fetch updates remote-tracking head +The system SHALL fetch a remote project branch by reading its branch head ref, materializing the referenced commit closure locally, and updating a local remote-tracking head. 
+ +#### Scenario: Fetch origin main +- **WHEN** `dml fetch origin main` succeeds +- **THEN** local storage contains the fetched commit closure and tracks `dml://alice/demo#main` as pointing to the fetched commit + +#### Scenario: Fetch explicit project URI +- **WHEN** `dml fetch dml://alice/tools#main` succeeds +- **THEN** local storage contains the fetched commit closure and tracks `dml://alice/tools#main` as pointing to the fetched commit + +#### Scenario: Fetch explicit project tag URI +- **WHEN** `dml fetch dml://alice/tools@v1.0` succeeds +- **THEN** local storage contains the fetched commit closure and tracks `dml://alice/tools@v1.0` as pointing to the fetched commit + +### Requirement: Pull fetches and merges +The system SHALL implement branch pull as fetch followed by merge of the fetched remote-tracking head into the current branch. + +#### Scenario: Pull origin main +- **WHEN** `dml pull origin main` succeeds while the current branch is `main` +- **THEN** local tracking ref `dml://alice/demo#main` is updated and local branch `main` advances to the merge result or fetched commit when already fast-forwardable + +#### Scenario: Pull different branch fails +- **WHEN** the current branch is `feature` and the user runs `dml pull origin main` +- **THEN** pull fails without merging or advancing the current branch + +### Requirement: Push uses ETag and fast-forward safety +The system SHALL update remote branch heads only with an ETag conditional write and SHALL reject non-fast-forward pushes unless force is requested. 
+ +#### Scenario: Fast-forward push +- **WHEN** the remote branch head is an ancestor of the local branch head and the observed ETag still matches +- **THEN** push updates the remote branch head to the local commit + +#### Scenario: Non-fast-forward push rejected +- **WHEN** the remote branch head is not an ancestor of the local branch head and force is not requested +- **THEN** push fails without updating the remote branch head + +#### Scenario: Force push keeps ETag safety +- **WHEN** force is requested and the observed ETag no longer matches +- **THEN** push fails without updating the remote branch head + +#### Scenario: Push missing branch without create fails +- **WHEN** push targets a remote branch ref that does not exist and `--create` is not provided +- **THEN** push fails without creating the remote branch ref + +#### Scenario: Push missing branch with create succeeds +- **WHEN** push targets a remote branch ref that does not exist and `--create` is provided +- **THEN** push writes the remote branch ref only if it still does not exist + +#### Scenario: Create push loses race +- **WHEN** push uses `--create` and another client creates the remote branch ref first +- **THEN** push fails without overwriting the remote branch ref + +### Requirement: Project sync commands require configured local project URI +The system SHALL require configured local `remote.project` before resolving default project-addressed remote refs for push, pull, fetch, or checkout flows. 
+ +#### Scenario: Push without configured project URI +- **WHEN** a repository has `remote.root` but no `remote.project` and push is requested +- **THEN** push fails with a descriptive error stating that `remote.project` is required for project sync + +#### Scenario: Pull without configured project URI +- **WHEN** a repository has `remote.root` but no `remote.project` and pull or fetch-by-project is requested +- **THEN** the operation fails with a descriptive error stating that `remote.project` is required for project sync + +#### Scenario: Checkout on init requires configured project URI +- **WHEN** init resolves `remote.root` but not `remote.project` +- **THEN** init does not attempt project-addressed fetch or checkout diff --git a/openspec/specs/repo-inspection-cli/spec.md b/openspec/specs/repo-inspection-cli/spec.md new file mode 100644 index 0000000..1a7fa37 --- /dev/null +++ b/openspec/specs/repo-inspection-cli/spec.md @@ -0,0 +1,99 @@ +### Requirement: Top-level CLI uses git-shaped repository inspection verbs +The public `dml` CLI SHALL expose repository-oriented porcelain commands at the top level: `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, and `revert`. + +#### Scenario: Top-level help reflects git-shaped porcelain +- **WHEN** a user inspects the top-level CLI surface +- **THEN** the documented primary commands are `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, and `revert` + +### Requirement: Status reports repository state instead of config state +`dml status` SHALL report current repository/runtime status as JSON, including the current HEAD state, available local branches, DAG map for the current revision, and live indexes. 
+ +#### Scenario: Status returns repository summary +- **WHEN** a user runs `dml status` +- **THEN** the command returns JSON with `head`, `branches`, `dags`, and `indexes` fields + +### Requirement: Show returns commit metadata, full DAG map, and commit delta +`dml show <revision>` SHALL resolve the revision locally and return JSON with top-level `revision`, `commit`, `dags`, and `change` fields. + +The `dags` field SHALL be the full DAG name-to-ref map for the resolved revision. The `change` field SHALL describe the DAG-map delta introduced by the resolved commit relative to its base commit. + +#### Scenario: Show returns full DAG map and change +- **WHEN** a user runs `dml show HEAD` +- **THEN** the command returns JSON containing `revision`, `commit`, `dags`, and `change` +- **AND** `dags` contains the complete DAG map for the resolved commit + +#### Scenario: Show root commit uses empty base +- **WHEN** a user runs `dml show` on a root commit with no parents +- **THEN** `change.base` is `null` +- **AND** every DAG in `dags` appears as an addition in `change` + +#### Scenario: Show merge commit uses first parent as base +- **WHEN** a user runs `dml show` on a merge commit with multiple parents +- **THEN** `change` is computed relative to the first parent commit + +### Requirement: Diff compares DAG maps between revisions +`dml diff [<left>] [<right>]` SHALL compare two locally resolved revisions and return DAG-map differences as JSON `added`, `removed`, and `updated` sections. + +#### Scenario: Diff returns DAG map changes +- **WHEN** a user runs `dml diff main feature` +- **THEN** the command returns JSON with `left`, `right`, `added`, `removed`, and `updated` fields + +### Requirement: Log returns commit entries for a revision walk +`dml log [<revision>] [--limit N]` SHALL return commit entries starting from the resolved revision, defaulting to `HEAD`. 
+ +#### Scenario: Log defaults to HEAD +- **WHEN** a user runs `dml log` +- **THEN** the command resolves `HEAD` +- **AND** returns JSON containing `revision` and `commits` + +### Requirement: Branch listing supports local and remote-tracking views +`dml branch` SHALL list local branches. `dml branch -r` and `dml branch --remote` SHALL list remote-tracking branches. + +#### Scenario: Branch lists local branches by default +- **WHEN** a user runs `dml branch` +- **THEN** the command returns JSON with a `branches` field containing local branch names + +#### Scenario: Branch lists remote-tracking branches +- **WHEN** a user runs `dml branch --remote` +- **THEN** the command returns JSON with a `branches` field containing remote-tracking branch selectors + +### Requirement: DAG inspection is organized under `dml dag` +The CLI SHALL expose DAG-oriented inspection commands under `dml dag`: `list`, `get`, `checkout`, and `delete`. + +#### Scenario: DAG commands are grouped under dag +- **WHEN** a user inspects DAG-related CLI help +- **THEN** DAG inspection and DAG tree mutation commands appear under `dml dag` + +### Requirement: DAG list returns revision-scoped DAG map +`dml dag list [--revision REV]` SHALL return the DAG name-to-ref map for the selected revision as JSON. + +#### Scenario: DAG list returns mapping +- **WHEN** a user runs `dml dag list --revision HEAD~1` +- **THEN** the command returns JSON with `revision` and `dags` +- **AND** `dags` is an object mapping DAG names to DAG refs + +### Requirement: DAG get resolves by name or exact DAG ref +`dml dag get <selector> [--revision REV]` SHALL resolve either a DAG name within a revision's DAG map or an explicit `dag:<id>` selector. + +If the selector is `dag:<id>`, the command SHALL reject any provided `--revision` flag. 
+ +#### Scenario: DAG get resolves name in revision +- **WHEN** a user runs `dml dag get train --revision HEAD~1` +- **THEN** the command resolves `train` in the DAG map for `HEAD~1` +- **AND** returns JSON containing `selector`, `revision`, and `dag` + +#### Scenario: DAG get loads exact DAG ref +- **WHEN** a user runs `dml dag get dag:abc123` +- **THEN** the command loads that exact DAG object +- **AND** returns JSON containing `selector` and `dag` + +#### Scenario: DAG get rejects revision with explicit DAG ref +- **WHEN** a user runs `dml dag get dag:abc123 --revision HEAD` +- **THEN** the command fails without resolving a revision + +### Requirement: DAG get includes node data +The `dml dag get` payload SHALL include the DAG's node data so that users do not need a separate DAG-node inspection endpoint for normal CLI workflows. + +#### Scenario: DAG get includes nodes +- **WHEN** a user runs `dml dag get train` +- **THEN** the returned `dag` object includes node-level data needed for DAG inspection diff --git a/openspec/specs/required-remote-config/spec.md b/openspec/specs/required-remote-config/spec.md new file mode 100644 index 0000000..b484fe2 --- /dev/null +++ b/openspec/specs/required-remote-config/spec.md @@ -0,0 +1,29 @@ +### Requirement: Remote-aware components require explicit remote configuration +The system SHALL require explicit remote configuration at the constructor or helper boundary for any runtime or ops component that performs remote-backed behavior. Remote-aware interfaces MUST receive normalized `remote.root` configuration from the shared internal configuration resolver rather than reading raw environment variables or project config files themselves. 
+ +#### Scenario: Remote-aware ops constructor requires remote URI +- **WHEN** a remote-aware ops type is defined +- **THEN** its constructor signature requires a concrete normalized remote URI argument rather than an optional remote parameter + +#### Scenario: Remote-aware runtime helper requires remote configuration +- **WHEN** a runtime helper delegates to remote-backed behavior +- **THEN** it passes explicit remote configuration to the remote-aware component it constructs + +#### Scenario: Remote-aware component does not resolve env vars directly +- **WHEN** a remote-aware runtime or ops component is used in a remote-backed flow +- **THEN** it receives already-resolved remote configuration from its caller instead of inspecting `DML_REMOTE`, older remote env-var forms, or project config files directly + +#### Scenario: Init fails when required remote URI cannot resolve validly +- **WHEN** the shared `Dml` init/bootstrap workflow requires remote-backed bootstrap behavior and shared config resolution does not produce a valid `remote.root` +- **THEN** init fails with a configuration error instead of proceeding with unresolved or implicit remote configuration + +### Requirement: Project sync operations require project identity in addition to remote root +The system SHALL require configured `remote.project` for project-addressed sync behavior such as push, pull, fetch, and init-time project checkout. These operations MUST fail closed when `remote.root` exists but `remote.project` is absent. 
+ +#### Scenario: Remote-backed mutation without project identity remains allowed +- **WHEN** a runtime or mutation operation requires only remote-backed storage or execution capability +- **THEN** configured `remote.root` is sufficient even when `remote.project` is absent + +#### Scenario: Project sync operation without project identity is rejected +- **WHEN** a project-addressed sync operation is requested and resolved config has no `remote.project` +- **THEN** the operation fails with a descriptive error instead of deriving project identity implicitly diff --git a/openspec/specs/revision-parsing-contract-matrix/spec.md b/openspec/specs/revision-parsing-contract-matrix/spec.md new file mode 100644 index 0000000..b0ebecf --- /dev/null +++ b/openspec/specs/revision-parsing-contract-matrix/spec.md @@ -0,0 +1,41 @@ +## Purpose +Establish a single contract-matrix owner for revision/ref/URI parsing behaviors so grammar coverage is centralized and workflow tests stay focused on operational invariants. + +## Requirements + +### Requirement: Revision and URI parsing contracts are centrally owned by one parameterized matrix suite +The repository SHALL define revision/ref/URI parsing behavior in one maintained contract test suite that uses parameterized case matrices rather than duplicating equivalent parsing assertions across workflow tests. 
+ +#### Scenario: Parsing contract matrix is the single maintained owner +- **WHEN** maintained tests assert behavior for `parse_ref`, DML URI canonicalization, or revision-form resolution +- **THEN** those assertions are implemented in the centralized parsing contract matrix suite instead of being repeated across unrelated workflow contract files + +#### Scenario: Workflow contracts avoid duplicate parsing assertions +- **WHEN** a workflow contract test validates delegation, state transitions, or side-effect invariants +- **THEN** it uses canonical valid inputs and does not re-assert grammar-level parsing variants already covered by the parsing matrix + +### Requirement: Parsing matrix cases include canonical contract IDs and explicit case labels +The centralized parsing matrix SHALL encode each case with direct canonical contract IDs and readable case labels in parameterized IDs. + +#### Scenario: Parameterized parsing case includes direct canonical ID +- **WHEN** a parsing behavior case is defined via parameterization +- **THEN** the case `id=` includes a direct literal canonical contract ID and a human-readable case label + +#### Scenario: Parsing case failures remain traceable +- **WHEN** a parsing matrix case fails +- **THEN** the failing node identifier includes both the contract ID and case label needed to identify the exact parsing form boundary + +### Requirement: Revision-form matrix covers accepted and rejected local resolution boundaries +The centralized parsing matrix SHALL cover the accepted revision forms and local-only rejection boundaries required by commit/project revision resolution behavior, including file-backed `HEAD` semantics. 
+ +#### Scenario: Accepted revision forms resolve with expected classification +- **WHEN** the suite evaluates accepted revision forms (branch, tag, ancestry expression, direct commit id, explicit commit ref, and `HEAD` backed by `.dml/HEAD`) +- **THEN** each form resolves to the expected classification and commit target for the fixture setup + +#### Scenario: Detached HEAD ancestry resolves from HEAD file +- **WHEN** `.dml/HEAD` contains a detached commit payload and the suite evaluates `HEAD~1` +- **THEN** resolution walks ancestry from the detached commit stored in `.dml/HEAD` + +#### Scenario: Unfetched remote revision form fails with local-resolution boundary +- **WHEN** a `dml://...#` revision form is evaluated without corresponding local tracking state +- **THEN** resolution fails with the documented local-resolution boundary error indicating fetch is required diff --git a/openspec/specs/runtime-execution-records/spec.md b/openspec/specs/runtime-execution-records/spec.md new file mode 100644 index 0000000..828eb2a --- /dev/null +++ b/openspec/specs/runtime-execution-records/spec.md @@ -0,0 +1,154 @@ +### Requirement: Runtime SHALL separate cache identity from execution identity +The runtime SHALL treat `cache_key` as the stable computation identity and `execution_id` as the stable identity of one execution attempt. The runtime SHALL acquire execution coordination locks by `cache_key` for launch, resume, and cancellation, SHALL propagate `execution_id` in the adapter envelope, and SHALL use execution id as the identity for dependency edges, execution state objects, and invalidation records. 
+ +#### Scenario: First launch creates a new execution identity +- **WHEN** `start_fn` observes a cache miss and confirms there is no active execution for the computed `cache_key` +- **THEN** it creates a new `execution_id` for that launch attempt +- **AND** it invokes the adapter with both `cache_key` and `execution_id` + +#### Scenario: Resume preserves the current execution identity +- **WHEN** `start_fn` observes an active execution for a `cache_key` +- **THEN** it SHALL reuse the referenced `execution_id` +- **AND** it SHALL NOT create a new `execution_id` for that execution while resuming it + +#### Scenario: Cancellation resolves lock identity from the execution record +- **WHEN** cancellation targets execution `e1` +- **AND** `exec/state/e1.json` records `cache_key = "ck1"` +- **THEN** the runtime SHALL acquire the execution coordination lock for `ck1` +- **AND** it SHALL continue to use `e1` as the execution-record and dependency-graph identity + +### Requirement: Runtime SHALL maintain an active execution pointer per cache key +The runtime SHALL persist the currently active execution for a `cache_key` at `active/<cache_key>` as plain text containing only the `execution_id`. + +#### Scenario: Active pointer is created for a new running execution +- **WHEN** the first adapter call for a new execution returns `running` +- **THEN** the runtime SHALL create `active/<cache_key>` containing that execution's `execution_id` + +#### Scenario: Stale active pointer is discarded +- **WHEN** `active/<cache_key>` exists but `exec/state/<execution_id>.json` does not exist +- **THEN** the runtime SHALL delete `active/<cache_key>` +- **AND** it SHALL treat the cache key as having no active execution + +### Requirement: Runtime SHALL maintain one mutable execution record per execution id +The runtime SHALL persist one mutable lifecycle object per execution id as `execution_record`, separate from caller-owned `launch_state`. 
`execution_record` SHALL include `execution_id`, `cache_key`, `lifecycle`, `updated_at`, `spawned_execution_ids`, and `cancellation_requested_by`, where `cancellation_requested_by` is `str | null`. `lifecycle` SHALL be one of `running`, `cancel-pending`, `cancel-detached`, `succeeded`, or `failed`. `spawned_execution_ids` SHALL be the deduped set of child execution ids started by that execution for cancellation traversal. `execution_record` updates SHALL use compare-and-swap with the latest known ETag. If a compare-and-swap update observes ETag drift, the runtime SHALL reread the record and SHALL raise cancellation interruption only when the reread lifecycle is already a `cancel-*` value; otherwise it SHALL continue from the latest valid reread state. + +The same `execution_record` schema SHALL also be used for each live index id. For index-root records, the object path SHALL be `exec/state/<index_id>.json`, `execution_id` SHALL equal the `index_id`, `cache_key` SHALL equal the `index_id`, and `spawned_execution_ids` SHALL track the deduped set of execution ids started from that index. 
+ +The `execution_record` schema SHALL be: + +- `execution_id: str` +- `cache_key: str` +- `lifecycle: "running" | "cancel-pending" | "cancel-detached" | "succeeded" | "failed"` +- `updated_at: int` +- `spawned_execution_ids: list[str]` +- `cancellation_requested_by: str | null` + +#### Scenario: Index creation creates the initial execution record +- **WHEN** `IndexOps.create` initializes a new runtime root +- **THEN** it SHALL create an `execution_record` for that root before execution starts +- **AND** that record SHALL use `execution_id = index_id` and `cache_key = index_id` + +#### Scenario: Lifecycle record does not store resume state +- **WHEN** the runtime persists `execution_record` for execution `e0` +- **THEN** it SHALL NOT store adapter resume state in that object +- **AND** resume state SHALL instead live only in caller-owned `launch_state` + +#### Scenario: CAS reread continues on non-cancellation drift +- **WHEN** a compare-and-swap update for `execution_record` observes an ETag conflict +- **AND** the reread lifecycle is `running`, `succeeded`, or `failed` +- **THEN** the runtime SHALL continue from the reread record instead of raising cancellation interruption + +#### Scenario: CAS reread raises on cancellation lifecycle drift +- **WHEN** a compare-and-swap update for `execution_record` observes an ETag conflict +- **AND** the reread lifecycle is `cancel-pending` or `cancel-detached` +- **THEN** the runtime SHALL surface cancellation interruption rather than continuing normal execution updates + +#### Scenario: Root record accumulates spawned execution ids +- **WHEN** index `idx1` starts execution `e1` +- **THEN** the runtime SHALL update `exec/state/idx1.json` so that `spawned_execution_ids` contains `e1` + +### Requirement: Cache refs SHALL remain proper refs and record execution ids +The runtime SHALL publish `refs/cache/<cache_key>.json` as a normal cache ref to the current manifest for that cache key, and that ref SHALL also record `execution_id` for the 
current execution. Readers that materialize cached results SHALL continue resolving the cached manifest through the ref target, and graph planners SHALL read `execution_id` from the same cache ref. + +#### Scenario: Successful execution updates cache pointer +- **WHEN** execution `e7` becomes the terminal cached result for cache key `ck1` +- **THEN** the runtime SHALL write `refs/cache/ck1.json` with `execution_id = "e7"` +- **AND** that object SHALL remain a valid cache ref with its manifest `target` + +#### Scenario: Re-run requires prior invalidation +- **WHEN** a later execution `e8` attempts to publish a terminal cached result for cache key `ck1` +- **AND** `refs/cache/ck1.json` already exists for an earlier execution +- **THEN** the runtime SHALL reject that cache publication +- **AND** the earlier cache ref MUST be invalidated or deleted before `e8` can publish `refs/cache/ck1.json` + +### Requirement: Adapter envelope and result schema SHALL follow the runtime-owned execution contract +The adapter envelope SHALL include `argv_ptr`, `cache_key`, `execution_id`, `remote`, `runnable`, `state`, `execution_status`, and `cancel_requested_by`. The adapter result SHALL use only `running`, `succeeded`, `failed`, or `cancel-detached` statuses. `running` MUST include durable `state`. `succeeded` MUST include `dag_id`. `failed` MUST include `error`. `cancel-detached` MUST identify a successful cancellation update that detached runtime ownership and MAY omit durable execution output. 
+ +#### Scenario: First adapter call uses null state +- **WHEN** the runtime invokes an adapter for a new execution +- **THEN** the adapter envelope SHALL include `state = null` + +#### Scenario: Cancel update includes renamed cancellation lifecycle +- **WHEN** the runtime invokes an adapter for a cancel update +- **THEN** the adapter envelope SHALL include `execution_status = "cancel-pending"` +- **AND** it SHALL include `cancel_requested_by` + +#### Scenario: Cancel update may return detached status +- **WHEN** an executor completes a cancel update successfully +- **THEN** the adapter result MAY use `status = "cancel-detached"` + +#### Scenario: Pending is rejected +- **WHEN** an adapter returns `pending` +- **THEN** the runtime SHALL reject that result as invalid adapter output + +### Requirement: Stale lock recovery SHALL preserve active execution ownership +The runtime SHALL use the lock for `cache_key` only to coordinate mutation of the active execution. If a lock is stale and an active execution record exists, the runtime SHALL recover the lock and resume that execution instead of creating a new one. + +#### Scenario: Stale lock with active execution resumes existing execution +- **WHEN** the lock for a `cache_key` is stale and `active/<cache_key>` points to an existing execution record +- **THEN** the runtime SHALL recover the lock +- **AND** it SHALL resume the existing `execution_id` +- **AND** it SHALL NOT launch a duplicate execution + +### Requirement: Failed execution SHALL be cached as a terminal result +If an adapter returns `failed`, the runtime SHALL complete the DAG with the error and SHALL publish that failed terminal outcome to cache for the `cache_key`. 
+ +#### Scenario: Failed adapter result populates cache +- **WHEN** an adapter returns `failed` for a cache key +- **THEN** the runtime SHALL complete the DAG with the reported error +- **AND** it SHALL publish the failed outcome into cache for that cache key + +#### Scenario: Failed execution clears active pointer +- **WHEN** an active execution returns `failed` +- **THEN** the runtime SHALL delete `active/<cache_key>` before surfacing the failure + +### Requirement: Runtime SHALL separate caller-owned launch state from runtime-owned lifecycle state +The runtime SHALL treat `launch_state` as caller-owned state for launch and resume, and `execution_record` as execution-runtime-owned state for lifecycle, spawned execution summaries, and cancellation metadata. The caller runtime MAY transition a callee `execution_record` only to `cancel-pending` or `cancel-detached` during orphan-triggered cancellation, and SHALL NOT otherwise mutate lifecycle state owned by the callee execution runtime. + +#### Scenario: Caller runtime owns launch state updates +- **WHEN** `start_fn` launches or resumes execution `e1` +- **THEN** the caller runtime SHALL be the only path that creates or updates `launch_state` for `e1` + +#### Scenario: Execution runtime owns terminal lifecycle publication +- **WHEN** execution `e1` reaches `succeeded` or `failed` +- **THEN** the execution runtime for `e1` SHALL publish that terminal lifecycle in `execution_record` +- **AND** caller runtimes SHALL NOT publish those terminal lifecycle values for `e1` + +### Requirement: Cancellation-detached lifecycle SHALL describe runtime detachment, not backend completion +`cancel-detached` SHALL mean that the runtime completed its cancellation responsibilities for that execution, removed current-execution ownership by clearing `active/<cache_key>`, and delegated any remaining backend shutdown handling to the adapter or executor contract. 
`cancel-detached` SHALL NOT mean that external cleanup has fully completed or that the rooted execution graph is fully cancelled. + +#### Scenario: Detached lifecycle permits fresh relaunch +- **WHEN** execution `e1` is marked `cancel-detached` +- **THEN** the runtime SHALL allow a future caller for the same `cache_key` to create a new execution attempt + +#### Scenario: Detached lifecycle does not prove backend exit +- **WHEN** execution `e1` is marked `cancel-detached` +- **THEN** callers SHALL NOT infer that all external resources for `e1` have already terminated + +### Requirement: Best-effort cancellation traversal MAY stop at terminal intermediates +The runtime SHALL perform cancellation traversal from `spawned_execution_ids` on a best-effort basis. If a descendant execution is reachable only through an already-terminal intermediate runtime that is not reconstructed, the runtime MAY leave that descendant running. + +#### Scenario: Terminal intermediate prevents deeper cancellation traversal +- **WHEN** execution `A` spawned `B`, `B` spawned `C`, and `B` is already terminal before `A` is cancelled +- **THEN** the runtime MAY cancel `A` without cancelling `C` +- **AND** that outcome SHALL be treated as an accepted limitation of best-effort cancellation diff --git a/openspec/specs/shared-internal-configuration/spec.md b/openspec/specs/shared-internal-configuration/spec.md new file mode 100644 index 0000000..28e9220 --- /dev/null +++ b/openspec/specs/shared-internal-configuration/spec.md @@ -0,0 +1,113 @@ +### Requirement: API and CLI use one shared internal configuration model +The system SHALL resolve configuration through one canonical internal configuration model owned by `_internal`. Both `daggerml.api` and the CLI SHALL use that shared internal resolver rather than maintaining frontend-specific configuration semantics. 
+ +#### Scenario: API and CLI share resolution behavior +- **WHEN** API code and CLI code resolve the same explicit values, environment variables, and config-file inputs +- **THEN** they produce the same resolved internal configuration for the underlying operation + +#### Scenario: Frontends remain thin bindings +- **WHEN** a frontend prepares to invoke shared internal operations +- **THEN** it delegates configuration precedence, validation, and derivation to shared internal configuration code instead of re-implementing those rules locally + +### Requirement: One resolver supports `project/runtime` and `global` scopes +The system SHALL expose one shared internal resolver that supports `project/runtime` and `global` scopes. Both scopes MUST use the same precedence model, but they load different config-file layers according to scope. + +#### Scenario: Project scope loads project and global config layers +- **WHEN** configuration is resolved in `project/runtime` scope +- **THEN** the resolver applies `explicit > environment variables > project config > global config > defaults` + +#### Scenario: Global scope omits project config +- **WHEN** configuration is resolved in `global` scope +- **THEN** the resolver applies `explicit > environment variables > global config > defaults` without requiring a project config file + +### Requirement: Canonical config parameters are reduced to one normalized set +The system SHALL normalize supported configuration inputs into the canonical internal parameters `project.home`, `remote.project`, `db.path`, `remote.root`, `user`, `default_branch`, `hooks.post-init`, `hooks.post-clone`, and `config_home`. 
+ +#### Scenario: Branch context is not a canonical config parameter +- **WHEN** project configuration is resolved +- **THEN** the canonical internal model does not include a separate branch-selection parameter and does not derive the active checkout branch from configuration + +#### Scenario: Legacy overlapping remote parameters are not canonical +- **WHEN** remote-backed configuration is resolved +- **THEN** the canonical remote parameter is `remote.root` rather than separate `remote.bucket` or `remote.prefix` parameters + +### Requirement: Multiple config sources normalize into the shared internal model +The system SHALL treat explicit arguments, environment variables, project-local config, and global config as sources that feed the shared internal configuration model. Source-specific loading may differ, but normalization and precedence MUST be centralized in the shared internal resolver. + +#### Scenario: Project-local and global config feed shared resolution +- **WHEN** a frontend resolves configuration for an operation in a project directory +- **THEN** project-local `.dml/config.toml` and any applicable global config inputs are loaded as sources for the same shared internal resolution path + +#### Scenario: Environment values are normalized centrally +- **WHEN** configuration is resolved from environment variables +- **THEN** the shared internal resolver, not the frontend, maps those values into the canonical internal configuration model + +#### Scenario: Init project layout creation delegates to shared internal helper +- **WHEN** the shared `Dml` init/bootstrap workflow must create missing project layout artifacts for a local project +- **THEN** it delegates filesystem bootstrap work to shared internal project-layout helper logic instead of duplicating directory and config-file writes across orchestration helpers + +#### Scenario: Init resolves explicit options through shared resolver +- **WHEN** a caller provides init-time options for project/runtime 
configuration +- **THEN** the shared `Dml` init/bootstrap workflow resolves them through the shared internal resolver before mutating project state + +### Requirement: Project URI is normalized and exposes helper accessors +The system SHALL normalize and canonicalize local `remote.project` as an optional branchless project identity through shared revision URI utilities. Resolved configuration SHALL treat checkout state as repository state owned by `.dml/HEAD` rather than as a selector embedded in config. + +#### Scenario: Local project URI remains branchless when configured +- **WHEN** `remote.project` is resolved for local project configuration +- **THEN** shared configuration preserves canonical branchless form `dml:///` + +#### Scenario: Local project configuration may omit project URI +- **WHEN** local project configuration omits `remote.project` +- **THEN** shared configuration resolves successfully without deriving project identity from other inputs + +#### Scenario: Tag or branch selector is not accepted for local project config +- **WHEN** local project configuration provides `remote.project` with a branch or tag selector +- **THEN** configuration resolution fails instead of translating that selector into checkout state + +#### Scenario: Project helper accessors do not expose current checkout branch +- **WHEN** resolved configuration includes `remote.project` +- **THEN** helper accessors expose project identity only and do not treat config as the source of the active branch or detached commit + +### Requirement: DB path can be overridden but defaults from project home +The system SHALL resolve `db.path` with the same precedence as other `project/runtime` parameters, and when no higher-precedence value is provided it SHALL default to `<project.home>/.dml/db/`.
+ +#### Scenario: Explicit DB path overrides dynamic default +- **WHEN** `db.path` is provided explicitly or through `DML_DB_PATH` +- **THEN** the resolved config uses that DB path instead of deriving it from `project.home` + +#### Scenario: DB path defaults from project home +- **WHEN** `db.path` is not provided and resolved config includes `project.home` +- **THEN** `db.path` resolves to `<project.home>/.dml/db/` + +### Requirement: CLI limitations caused by serialization are documented, not treated as config divergence +The system SHALL document only those public `Dml` workflows that remain unavailable in the CLI because their public parameter types cannot be generated faithfully from command-line input. These omissions MUST NOT create a separate CLI-specific configuration model. + +#### Scenario: Unsupported public parameter types remain API-only +- **WHEN** a public workflow exposes parameter types that the CLI generator cannot represent cleanly +- **THEN** the documentation identifies that workflow as unavailable in the CLI while preserving the shared internal configuration model for supported operations + +#### Scenario: CLI-generatable public workflows are not excluded for historical reasons +- **WHEN** a public workflow uses only CLI-generatable parameter types +- **THEN** the CLI exposes that workflow instead of treating it as API-only based on prior manual CLI limitations + +#### Scenario: Missing CLI feature does not imply different config rules +- **WHEN** a capability is supported by both API and CLI +- **THEN** both frontends use the same shared internal configuration rules for that capability + +### Requirement: CLI explicit override names mirror canonical config parameters +The CLI SHALL name explicit configuration override flags after the canonical parameters they populate in the shared internal resolver whenever those parameters are exposed directly to users.
+ +#### Scenario: Project-home flag maps to canonical parameter +- **WHEN** the CLI resolves an explicit local project path override +- **THEN** it reads that value from a flag named after `project.home` +- **AND** it forwards the value into shared resolution as `project.home` + +#### Scenario: Remote-root flag maps to canonical parameter +- **WHEN** the CLI resolves an explicit remote project override +- **THEN** it reads that value from a flag named after `remote.root` +- **AND** it forwards the value into shared resolution as `remote.root` + +#### Scenario: Existing canonical names remain unchanged +- **WHEN** the CLI exposes other explicit config-shaped overrides such as `--remote-project` or `--config-home` +- **THEN** those flags continue using the established canonical names rather than introducing alternate aliases diff --git a/openspec/specs/supervisor-cloudwatch-streaming/spec.md b/openspec/specs/supervisor-cloudwatch-streaming/spec.md new file mode 100644 index 0000000..11fa80f --- /dev/null +++ b/openspec/specs/supervisor-cloudwatch-streaming/spec.md @@ -0,0 +1,47 @@ +### Requirement: Supervisor streams worker stdout and stderr to CloudWatch Logs +The supervisor SHALL stream worker `stdout` and `stderr` to AWS CloudWatch Logs while the worker process is still running, in addition to preserving local `stdout.log` and `stderr.log` files. 
+ +#### Scenario: Stdout is streamed while the worker runs +- **WHEN** the supervisor starts a worker that writes to `stdout` +- **THEN** the supervisor writes the output to the local `stdout.log` file and publishes the same output to CloudWatch Logs before the worker exits + +#### Scenario: Stderr is streamed while the worker runs +- **WHEN** the supervisor starts a worker that writes to `stderr` +- **THEN** the supervisor writes the output to the local `stderr.log` file and publishes the same output to CloudWatch Logs before the worker exits + +### Requirement: Supervisor uses fixed CloudWatch log destinations per run +The supervisor SHALL publish worker logs to log group `dml` and SHALL use exactly two log streams named `/run/{cache_key}/stdout` and `/run/{cache_key}/stderr` for the corresponding worker output channels. + +#### Scenario: Stdout stream name is derived from cache key +- **WHEN** the supervisor launches a worker for a given `cache_key` +- **THEN** worker `stdout` events are published to CloudWatch log stream `/run/{cache_key}/stdout` in log group `dml` + +#### Scenario: Stderr stream name is derived from cache key +- **WHEN** the supervisor launches a worker for a given `cache_key` +- **THEN** worker `stderr` events are published to CloudWatch log stream `/run/{cache_key}/stderr` in log group `dml` + +#### Scenario: Supervisor does not rewrite stream names +- **WHEN** the supervisor computes CloudWatch stream names from `cache_key` +- **THEN** it uses the exact names `/run/{cache_key}/stdout` and `/run/{cache_key}/stderr` without a compatibility alias or sanitization shim + +### Requirement: Supervisor emits lifecycle metadata at stream start and end +The supervisor SHALL emit a lifecycle event to each CloudWatch log stream when streaming begins and another lifecycle event when streaming ends. Lifecycle events SHALL include `execution_id`, `cache_key`, the stream kind (`stdout` or `stderr`), and the terminal status when streaming ends. 
+ +#### Scenario: Start lifecycle event is emitted before worker output +- **WHEN** the supervisor initializes CloudWatch streaming for a worker output channel +- **THEN** it first publishes a lifecycle event containing the execution metadata for that channel before publishing worker output events + +#### Scenario: End lifecycle event is emitted after worker exit +- **WHEN** the worker process has exited and the supervisor has determined the terminal result +- **THEN** it publishes a lifecycle event containing the execution metadata and terminal status for each channel before closing CloudWatch streaming + +### Requirement: CloudWatch failures do not fail worker execution +CloudWatch client, log-stream, or event-delivery failures SHALL be non-fatal to execution. When CloudWatch streaming fails, the supervisor SHALL continue capturing worker output locally and SHALL continue evaluating the worker terminal result using the existing supervisor result contract. + +#### Scenario: CloudWatch initialization fails +- **WHEN** the supervisor cannot initialize CloudWatch logging for a worker output channel +- **THEN** the supervisor continues the worker run, preserves local log-file capture, and still returns the worker terminal result normally + +#### Scenario: CloudWatch delivery fails after streaming has started +- **WHEN** CloudWatch event delivery fails during an active worker run +- **THEN** the supervisor continues capturing output locally for the rest of the run and still returns the worker terminal result normally diff --git a/openspec/specs/test-contract-matrix/spec.md b/openspec/specs/test-contract-matrix/spec.md new file mode 100644 index 0000000..95aa75e --- /dev/null +++ b/openspec/specs/test-contract-matrix/spec.md @@ -0,0 +1,90 @@ +## Purpose +Define contract-testing taxonomy, canonical identifier practices, and migration guardrails so maintained tests stay non-duplicative, traceable, and fast-path friendly. 
+ +## Requirements + +### Requirement: Contract-first test taxonomy +The repository SHALL organize maintained tests by contract intent with distinct locations for fast invariant checks and integration behavior. + +#### Scenario: Fast contract tests live under contracts taxonomy +- **WHEN** a test verifies a documented contract or invariant in isolation +- **THEN** it is placed under `tests/contracts/` + +#### Scenario: Integration tests live under integration taxonomy +- **WHEN** a test exercises multi-component behavior, external processes, remote roundtrips, or runtime orchestration +- **THEN** it is placed under `tests/integration/` + +### Requirement: Canonical contract IDs are embedded directly in test identifiers +Each maintained contract-focused test SHALL include a canonical contract ID expressed as a direct literal string in test naming surfaces. + +#### Scenario: Parameterized lifecycle case includes canonical ID +- **WHEN** a test case is defined in `pytest.mark.parametrize` +- **THEN** the case `id=` string includes the canonical contract ID followed by a human-readable case label + +#### Scenario: Canonical IDs avoid indirection +- **WHEN** a test references a canonical contract ID +- **THEN** the ID is specified directly in the test or parameterized case definition and does not require a shared ID registry indirection + +### Requirement: Lifecycle coverage uses parameterized stage matrices +Lifecycle-oriented contracts SHALL be tested with parameterized cases that explicitly represent each lifecycle stage. 
+ +#### Scenario: Lifecycle stages are represented as explicit parameterized cases +- **WHEN** a contract family spans kickoff, resume/poll, and terminal behavior +- **THEN** one parameterized test defines stage-specific cases with distinct IDs and assertions for each stage + +#### Scenario: Stage-specific failures identify contract and stage +- **WHEN** a lifecycle parameterized case fails +- **THEN** the failure node identifier includes both canonical contract ID and stage label + +### Requirement: Integration tests are marked slow +Integration tests SHALL be marked `@pytest.mark.slow` so they can be excluded from quick local runs. + +#### Scenario: Integration test carries slow marker +- **WHEN** a test resides in the integration taxonomy or otherwise requires integration-level runtime behavior +- **THEN** the test is marked `@pytest.mark.slow` + +#### Scenario: Fast test selection excludes integration tests +- **WHEN** contributors run `pytest -m "not slow"` +- **THEN** tests marked `slow` are excluded and the remaining suite represents the fast-path contract checks + +### Requirement: Legacy test suite is fully migrated and superseded tests are removed +The repository SHALL complete migration of maintained tests to the contract matrix setup and SHALL remove superseded legacy tests to avoid duplicate maintenance. 
+ +#### Scenario: Superseded legacy tests are removed after parity +- **WHEN** a legacy test's contract coverage is represented by migrated contract-matrix tests +- **THEN** the legacy test is removed from maintained test paths + +#### Scenario: End state contains only maintained tests aligned to taxonomy +- **WHEN** migration is complete +- **THEN** maintained tests conform to taxonomy, canonical ID, lifecycle parameterization, and slow-marker requirements defined in this specification + +#### Scenario: Redundant parser smoke tests are removed once equivalent arg-level coverage exists +- **WHEN** a parser-creation smoke test duplicates parser argument assertions already maintained in the same suite +- **THEN** the redundant parser-creation smoke test is removed after parity verification + +#### Scenario: Duplicate revision parsing checks are removed after central matrix adoption +- **WHEN** revision/ref/URI parsing forms are covered by the centralized parsing contract matrix +- **THEN** duplicate parsing checks in workflow-oriented contract tests are removed and workflow tests remain focused on operational invariants + +#### Scenario: External-process orchestration tests are classified as slow +- **WHEN** a test requires subprocess execution, adapter polling loops, remote roundtrips, or equivalent runtime orchestration +- **THEN** the test is marked `slow` and excluded from `pytest -m "not slow"` selection + +#### Scenario: Expensive adapter-path duplicates are collapsed into parameterized matrices +- **WHEN** multiple maintained tests exercise the same adapter-path contract family with near-identical setup and assertions +- **THEN** they are consolidated into one parameterized matrix suite that preserves canonical contract IDs and behavior-stage traceability + +### Requirement: Migration ledger governs parity and removal +The repository SHALL track migration progress in a ledger that maps canonical contract IDs from legacy tests to migrated tests and records 
parity evidence before legacy removal. + +#### Scenario: Batch plan records concrete suite order and risk +- **WHEN** migration planning is established +- **THEN** the ledger records bounded batch order with risk levels and exit criteria for each batch + +#### Scenario: Contract mapping is explicit for each migrated suite +- **WHEN** a suite is selected for migration +- **THEN** the ledger records canonical contract IDs and old/new test file mappings for that suite + +#### Scenario: Legacy test removal requires parity evidence +- **WHEN** a legacy suite is proposed for removal +- **THEN** the ledger includes passing evidence for targeted migrated suites, `pytest -m "not slow"`, and full `pytest` prior to removal diff --git a/openspec/specs/thin-cli-routing/spec.md b/openspec/specs/thin-cli-routing/spec.md new file mode 100644 index 0000000..a94a998 --- /dev/null +++ b/openspec/specs/thin-cli-routing/spec.md @@ -0,0 +1,26 @@ +# thin-cli-routing Specification + +## Purpose +TBD - created by archiving change thin-cli-git-ops-routing. Update Purpose after archive. +## Requirements +### Requirement: CLI project commands delegate to a single Dml workflow method +The `dml` CLI project command handlers SHALL remain thin adapters that parse command arguments and invoke exactly one workflow entrypoint per command path. 
+ +#### Scenario: Fetch delegates through Dml +- **WHEN** a user runs `dml fetch [branch]` +- **THEN** the CLI handler parses inputs and calls one shared `Dml` fetch workflow method that performs remote synchronization behavior + +#### Scenario: Checkout delegates through Dml +- **WHEN** a user runs `dml checkout <revision>` +- **THEN** the CLI handler parses the revision and calls one shared `Dml` checkout workflow method that returns attached/detached result details + +#### Scenario: Merge delegates through Dml +- **WHEN** a user runs `dml merge --head <head> --user <user>` +- **THEN** the CLI handler calls one shared `Dml` merge workflow method and does not instantiate commit or remote ops directly + +### Requirement: CLI does not own git-like project business logic +The `_cli` layer SHALL NOT contain git-like project orchestration logic that coordinates repository state, commit resolution, or remote protocol execution. + +#### Scenario: Project logic relocation +- **WHEN** git-like project command behavior requires cross-subsystem coordination +- **THEN** the implementation resides in the shared `Dml` workflow layer and the internal ops it invokes, while CLI code remains argument parsing and result forwarding only diff --git a/openspec/specs/unified-dml-surface/spec.md b/openspec/specs/unified-dml-surface/spec.md new file mode 100644 index 0000000..c9d3ce3 --- /dev/null +++ b/openspec/specs/unified-dml-surface/spec.md @@ -0,0 +1,219 @@ +## Purpose +Define the canonical shared `_internal.Dml` boundary, its fixed caller-facing surface, and the delegation constraints between that surface and lower-level ops/config-resolution helpers. + +## Requirements + +### Requirement: One shared `_internal.Dml` class is the canonical orchestration boundary +The system SHALL expose one shared `_internal.Dml` class for repository, DAG, admin, and runtime workflows.
+ +#### Scenario: CLI delegates through shared Dml +- **WHEN** a CLI command executes a repository, DAG, admin, or runtime workflow +- **THEN** the handler instantiates or receives a `Dml` instance and delegates through that class instead of orchestrating lower-level ops classes directly + +#### Scenario: API wrappers delegate through shared Dml +- **WHEN** `Dag` or `Node` wrappers need repository/runtime behavior +- **THEN** they delegate through the shared internal `Dml` implementation, whether by direct use or by a thin compatibility wrapper in `daggerml.api` + +### Requirement: `Dml` delegates fuzzy and config resolution to dedicated submodules +The shared `Dml` class SHALL remain the sole caller-facing boundary for fuzzy selector and config-derived context behavior, but it SHALL farm fuzzy selector resolution to a dedicated fuzzy-resolution submodule and config-derived context lookup to a dedicated config submodule. + +#### Scenario: Revision parsing delegates to fuzzy-resolution submodule +- **WHEN** a caller passes a supported revision string to a `Dml` repository method +- **THEN** `Dml` delegates the fuzzy parsing and resolution step to the fuzzy-resolution submodule before invoking lower-level ops + +#### Scenario: Current head and remote context delegate to config submodule +- **WHEN** a `Dml` workflow needs current head state, default branch behavior, or remote-uri context +- **THEN** `Dml` obtains that config-derived context through the config submodule before invoking lower-level ops + +### Requirement: Shared `Dml` constructor uses root runtime override inputs +The shared `Dml` constructor SHALL accept the root runtime override inputs already threaded through callers for project-home, remote-uri, user, and config-home context. 
+ +#### Scenario: CLI globals map directly to constructor +- **WHEN** a caller provides explicit project-home, remote-uri, user, or config-home runtime overrides +- **THEN** those values can be passed directly to the shared `Dml` constructor without a separate caller-specific context adapter + +### Requirement: Shared `Dml` exact DB object contracts use `Ref` +The shared `Dml` surface SHALL require `Ref` objects for caller inputs that represent exact DB-backed objects, and it SHALL return `Ref` objects as the canonical identity for DB-backed objects in its payloads. + +#### Scenario: Exact DAG access requires `Ref` +- **WHEN** a caller invokes a `Dml` method whose contract is to dereference an exact DAG object +- **THEN** the method requires a `Ref` +- **AND** it does not accept a plain `"dag:..."` string as a substitute + +#### Scenario: Exact node access requires `Ref` +- **WHEN** a caller invokes a `Dml` method whose contract is to dereference an exact node object +- **THEN** the method requires a `Ref` +- **AND** it does not accept a plain `"node:..."` string as a substitute + +#### Scenario: Non-DB selectors remain strings +- **WHEN** a caller provides a revision selector, DAG name, node name, branch, tag, remote URI, or `index_id` +- **THEN** the shared `Dml` surface continues to accept that value as a string + +#### Scenario: DB-backed payloads use ref identity +- **WHEN** a shared `Dml` payload includes the identity of a commit, DAG, node, or other DB-backed object +- **THEN** that identity is represented by `Ref` +- **AND** the payload does not duplicate the same DB identity as a separate raw `id` string + +### Requirement: Shared `Dml` exposes the fixed method namespaces +The shared `Dml` class SHALL expose this caller-facing method surface: + +- top level: `status`, `show`, `log`, `diff`, `checkout`, `branch`, `fetch`, `pull`, `push`, `merge`, `revert` +- `dag`: `list`, `get`, `checkout`, `delete` +- `admin.index`: `list`, `get`, `delete` +- 
`admin.cache`: `invalidate` +- `admin.remote`: `list`, `gc` +- `admin`: `gc` +- `runtime`: `create`, `describe`, `put_literal`, `put_import`, `start_fn`, `cancel`, `commit` +- `config`: `get`, `set`, `show` +- `ops`: `commit`, `head`, `dag`, `node`, `index`, `cache`, `remote`, `gc`, `config` + +#### Scenario: Top-level repository methods are present +- **WHEN** a caller inspects the shared `Dml` class +- **THEN** the repository porcelain workflows are available on the top level rather than through raw subsystem factories + +#### Scenario: DAG, admin, runtime, and config methods remain namespaced +- **WHEN** a caller needs DAG inspection, admin maintenance, runtime staging behavior, or config access +- **THEN** the shared `Dml` exposes those methods under `dag`, `admin`, `runtime`, and `config` namespaces respectively + +#### Scenario: Runtime namespace exposes cancel +- **WHEN** a caller needs to cancel work rooted at an index +- **THEN** the shared `Dml` exposes that workflow as `dml.runtime.cancel(index_id)` + +#### Scenario: Exact subsystem objects are grouped under ops +- **WHEN** a caller needs direct exact-input subsystem behavior such as `CommitOps`, `HeadOps`, or `IndexOps` +- **THEN** the shared `Dml` exposes those objects under `dml.ops.*` rather than as direct top-level `Dml` attributes + +### Requirement: Shared `Dml` surface SHALL be introspection-ready +The shared `Dml` boundary and its public namespaces SHALL expose runtime documentation that explains class purpose, method behavior, and parameter meaning without changing workflow semantics, and that metadata SHALL be sufficient for generated CLI help. 
+ +#### Scenario: Namespace objects describe their purpose +- **WHEN** a caller inspects `Dml` or any namespace reachable through `dml.config`, `dml.runtime`, `dml.dag`, or `dml.admin` +- **THEN** the class exposes a docstring that describes the purpose of that boundary or namespace + +#### Scenario: Public methods describe behavior +- **WHEN** a caller inspects a public top-level or namespaced `Dml` method +- **THEN** the method exposes a docstring that describes the operation behavior and any notable constraints or side effects + +#### Scenario: Generated CLI help can use runtime docs +- **WHEN** the CLI generator inspects `Dml` or one of its public namespace methods +- **THEN** it can derive command descriptions and parameter help from runtime docstrings and annotation metadata without a separate command-specific help registry + +### Requirement: Shared `Dml` parameters SHALL expose machine-readable help metadata +Public parameters on the shared `Dml` surface and its public namespace methods SHALL use `typing.Annotated` metadata to describe parameter meaning, while Python signature defaults remain the source of truth for default values. 
+ +#### Scenario: Parameter meaning is available from annotations +- **WHEN** a caller inspects annotations for a public `Dml` method or a public method on a `Dml` namespace object with extras included +- **THEN** the parameter annotations include `Annotated` metadata that describes what each user-facing parameter means + +#### Scenario: Defaults remain in the signature +- **WHEN** a public `Dml` or namespaced method has a defaulted parameter +- **THEN** the default value remains represented by the Python signature +- **AND** the `Annotated` metadata does not become the source of truth for that default + +#### Scenario: Ambiguous selector parameters may include examples +- **WHEN** a public `Dml` parameter accepts potentially confusing selector or URI forms such as revision selectors or remote project identifiers +- **THEN** the `Annotated` metadata MAY include concise examples that clarify accepted forms without redefining the underlying grammar + +#### Scenario: Non-generatable CLI parameters are not part of the public method surface +- **WHEN** a public workflow depends on helper state that cannot be generated from CLI input such as an S3 client object +- **THEN** that helper state is provided through `Dml` instance construction or private instance state rather than through a public method parameter + +### Requirement: `Dml` stores runtime context, S3 client state, and temporary-directory bookkeeping +The shared `Dml` class SHALL keep only `_context`, `_s3_client`, and `_tempdirs` as private instance attributes. Helper behavior that supports `Dml` public methods SHALL live in module-level functions within `daggerml._internal.dml` rather than in private `Dml` instance methods. 
+ +#### Scenario: Namespace and helper access do not require extra Dml instance fields +- **WHEN** a caller uses any public namespace on `Dml` +- **THEN** the namespace behavior is derived from `_context`, `_s3_client`, `_tempdirs`, and delegated helper logic without introducing additional private `Dml` instance attributes + +#### Scenario: Dml public workflows do not depend on private helper methods +- **WHEN** a `Dml` repository, runtime, DAG, admin, or config workflow needs helper behavior such as ops dispatch, payload shaping, or revision binding +- **THEN** that helper behavior executes through module-level functions in `daggerml._internal.dml` rather than through `Dml._...` instance methods + +#### Scenario: Namespace objects keep only Dml as private state +- **WHEN** a caller inspects the namespace objects exposed by `Dml` +- **THEN** each namespace object keeps only `._dml` as private instance state +- **AND** namespace helper behavior does not rely on additional private attrs or private helper methods on the namespace object + +#### Scenario: Remote sync workflows reuse the Dml-owned S3 client +- **WHEN** a caller invokes `dml.fetch`, `dml.pull`, or `dml.push` +- **THEN** the workflow uses the `Dml` instance's private `_s3_client` instead of requiring a public `s3_client` method parameter + +### Requirement: `Dml` is the only fuzzy-selector boundary +The shared `Dml` class SHALL accept fuzzy selector strings only for workflows whose contract is lookup or repository navigation, and it SHALL require exact `Ref` objects for workflows whose contract is direct dereference or mutation of DB-backed objects. 
+ +#### Scenario: Revision selector resolves inside Dml +- **WHEN** a caller passes a supported revision string such as `HEAD~1` to a shared `Dml` repository method +- **THEN** the `Dml` method resolves it through the fuzzy-resolution submodule and lower-level ops receive only exact values + +#### Scenario: DAG-name lookup resolves inside Dml +- **WHEN** a caller passes a DAG name to a shared `Dml` lookup workflow that documents name-based selection +- **THEN** the shared `Dml` method performs that selector resolution through the fuzzy-resolution submodule and lower-level ops do not parse that caller-facing form + +#### Scenario: Exact DB-object workflow rejects fuzzy string grammar +- **WHEN** a caller passes a ref-like string such as `dag:abc123`, `node-literal:abc123`, or `commit:abc123` to a shared `Dml` workflow whose contract is for an exact DB-backed object +- **THEN** the method fails rather than coercing that string into a `Ref` + +#### Scenario: Unsupported fuzzy grammar is rejected at Dml boundary +- **WHEN** a caller passes a selector form that is not documented by the redesigned CLI contracts +- **THEN** the shared `Dml` method fails rather than inventing additional grammar + +### Requirement: Lower-level ops classes accept resolved values only +Lower-level ops classes used by `Dml` SHALL accept exact refs, exact branch names, exact ids, and other resolved repository values rather than caller-facing fuzzy selectors or config-shaped overrides. + +#### Scenario: Commit workflow uses exact values below Dml +- **WHEN** a shared `Dml` method invokes commit/head workflow behavior +- **THEN** the lower-level ops calls receive already-resolved commits, branches, or ids instead of revision grammar strings + +### Requirement: `Dml` delegates repository behavior to the relevant ops classes +The shared `Dml` class SHALL orchestrate workflows by delegating repository actions to the relevant subsystem ops classes rather than re-implementing those mechanics inline. 
Module-level helper functions in `daggerml._internal.dml` SHALL construct the owning concrete ops classes directly and SHALL NOT route calls through a facade object or string-dispatch proxy layer. + +#### Scenario: Commit-oriented workflow delegates to CommitOps +- **WHEN** a caller invokes `dml.show`, `dml.log`, `dml.diff`, `dml.merge`, or `dml.revert` +- **THEN** `Dml` delegates the relevant repository operations to `CommitOps` after preparing resolved inputs + +#### Scenario: Runtime workflow delegates to IndexOps +- **WHEN** a caller invokes `dml.runtime.create`, `dml.runtime.put_literal`, `dml.runtime.start_fn`, or `dml.runtime.commit` +- **THEN** `Dml` delegates the relevant repository operations to `IndexOps` after preparing resolved inputs + +#### Scenario: Admin workflow delegates to the owning subsystem +- **WHEN** a caller invokes an admin cache, remote, or gc workflow +- **THEN** `Dml` delegates the repository action to `CacheOps`, `RemoteOps`, or `GcOps` respectively after preparing resolved inputs + +#### Scenario: Helper construction instantiates concrete ops directly +- **WHEN** a shared `Dml` workflow needs an ops object such as `CommitOps`, `HeadOps`, `IndexOps`, or `RemoteOps` +- **THEN** the helper logic in `daggerml._internal.dml` constructs that concrete ops class directly against the active DB handle +- **AND** it does not dispatch through a `DmlOps` facade or `_OpsProxy`-style string factory + +### Requirement: Shared `Dml` returns JSON-ready payloads +Shared `Dml` methods SHALL return JSON-ready dict/list payloads for container structure, while allowing typed leaves such as `Ref`, `Uri`, `Error`, and `Runnable`. 
+ +#### Scenario: CLI-ready result shape comes from Dml +- **WHEN** a caller invokes a shared `Dml` repository or admin workflow +- **THEN** the returned payload is ready for JSON serialization without CLI-owned result reshaping beyond standard typed-leaf encoding + +### Requirement: Repository bootstrap and recovery are available through shared `Dml` +Repository bootstrap and recovery workflows SHALL be available through the shared `Dml` boundary. + +#### Scenario: Init and recovery use Dml-owned entrypoint +- **WHEN** a caller invokes repository bootstrap or recovery behavior +- **THEN** the workflow executes through a `Dml` entrypoint and preserves the documented config-first recovery semantics + +### Requirement: Direct user cancellation SHALL use configured user identity +When `dml.runtime.cancel(index_id)` is invoked without an active runtime execution context, the workflow SHALL still proceed as an out-of-band cancellation operation. In that case, the runtime SHALL record `cancellation_requested_by` from the configured user identity. + +#### Scenario: User-triggered cancel records configured user without active execution +- **WHEN** a user directly invokes `dml.runtime.cancel("idx1")` +- **AND** there is no active caller `execution_id` +- **THEN** the runtime SHALL set `cancellation_requested_by` to `config.user` + +#### Scenario: Missing configured user still fails cancel +- **WHEN** a user invokes `dml.runtime.cancel("idx1")` +- **AND** no configured user identity is available +- **THEN** the runtime SHALL fail the request rather than persisting an empty cancellation requester + +### Requirement: Runtime cancellation SHALL be out-of-band control-plane behavior +`dml.runtime.cancel(index_id)` SHALL operate as an out-of-band control-plane workflow rather than as a continuation of a running execution. 
The workflow SHALL freeze the target index, remove caller-owned live edges, orphan eligible callees, and request detached cancellation without requiring an active caller execution context. + +#### Scenario: Direct cancel freezes index before cancellation traversal +- **WHEN** a user invokes `dml.runtime.cancel("idx1")` +- **THEN** the runtime SHALL freeze the index before removing live caller edges or requesting callee cancellation diff --git a/pyproject.toml b/pyproject.toml index 2c94cbb..d50db93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["hatchling", "hatch-vcs"] -build-backend = "hatchling.build" +requires = ["scikit-build-core>=0.11", "setuptools-scm>=8", "Cython>=3.0"] +build-backend = "scikit_build_core.build" [project] name = "daggerml" @@ -10,31 +10,50 @@ authors = [ ] description = "DaggerML" readme = "README.md" -requires-python = ">=3.9" -license = "MIT" +requires-python = ">=3.10" +license = {text = "MIT"} keywords = [] classifiers = [ - "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Development Status :: 4 - Beta", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = [] +dependencies = [ + "typing-extensions>=4.0.0; python_version < '3.11'", + "boto3", +] [project.optional-dependencies] -cli = ["daggerml-cli>=0.0.38"] -dev = [ - "pytest", - "pytest-cov", - "moto[all,server]", +table = [ + "tabulate", + "asciidag", ] +[project.scripts] +dml = "daggerml._cli:cli" +dml-local-adapter = "daggerml.contrib.adapters:LocalAdapter.cli" +dml-lambda-adapter = "daggerml.contrib.adapters:LambdaAdapter.cli" + 
+[project.entry-points."daggerml.contrib.adapters"] +local = "daggerml.contrib.adapters:LocalAdapter" +lambda = "daggerml.contrib.adapters:LambdaAdapter" + +[project.entry-points."daggerml.contrib.executors"] +batch = "daggerml.contrib.executors:BatchExecutor" +script = "daggerml.contrib.executors:ScriptExecutor" +docker = "daggerml.contrib.executors:DockerExecutor" +ssh = "daggerml.contrib.executors:SshExecutor" +cfn = "daggerml.contrib.executors:CfnExecutor" + +[project.entry-points."daggerml.codecs"] +dataframe = "daggerml.contrib.codecs:literal_codecs" +builtin = "daggerml.codecs:codecs" [project.urls] Homepage = "https://daggerml.com" @@ -43,10 +62,24 @@ Issues = "https://github.com/daggerml/python-lib/issues" Source = "https://github.com/daggerml/python-lib" License = "https://github.com/daggerml/python-lib/blob/master/LICENSE" +[tool.scikit-build] +wheel.packages = ["src/daggerml"] +metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" +sdist.include = ["src/daggerml/__about__.py"] + +[tool.setuptools_scm] +write_to = "src/daggerml/__about__.py" +tag_regex = "^v(?P<version>[0-9]+(?:\\.[0-9]+)*[a-zA-Z0-9\\.\\-]*)$" + +[[tool.scikit-build.generate]] +path = "daggerml/__about__.py" +location = "install" +template = "__version__ = \"${version}\"\n" + [tool.pytest.ini_options] pythonpath = "tests/" minversion = "6.0" -addopts = "-ra --ignore=submodules/" +addopts = "-ra --import-mode=importlib" testpaths = [ "tests", "src/daggerml", @@ -56,47 +89,12 @@ markers = [ "serial", ] -[tool.hatch.version] -source = "vcs" - -[tool.hatch.version.raw-options] -version_scheme = "only-version" # use tag as version -local_scheme = "no-local-version" # pypi does not support local-schemes -tag_regex = "^v(?P<version>[0-9]+\\.[0-9]+\\.[0-9]+(?:[-\\.][a-zA-Z0-9]+)*)$" -version_file = "src/daggerml/__about__.py" # write version info to this file -relative_to = "{root:uri}" - -[tool.hatch.build.targets.wheel] -packages = ["src/daggerml"] -artifacts = [ - "src/daggerml/__about__.py" 
# add to hatch build because it's git ignored -] - -[tool.hatch.envs.default] -python="3.10" -features = ["dev"] -dependencies = ["twine"] -pre-install-commands = [ - "pip install -e {root:uri}/submodules/daggerml_cli", -] - -[tool.hatch.envs.default.scripts] -test = 'pytest . {args}' -coverage-report = 'pytest --cov-report term-missing --cov=daggerml tests/' -dml-build = "hatch build && twine upload -r {args:testpypi} dist/*" - -[tool.hatch.envs.test-all] -template = "default" - -[[tool.hatch.envs.test-all.matrix]] -python = ["3.9", "3.10", "3.11", "3.12", "3.13"] - [tool.ruff] -target-version = "py39" +target-version = "py313" line-length = 120 [tool.ruff.lint] -select = ["E", "F", "B", "I", "PGH004"] +select = ["E", "F", "B", "I", "PGH004", "W"] ignore = [] unfixable = ["B"] @@ -107,7 +105,8 @@ known-first-party = ["daggerml"] ban-relative-imports = "all" [tool.ruff.lint.per-file-ignores] -"__init__.py" = ["F401", "E402"] +"src/daggerml/__init__.py" = ["F401", "E402"] +"src/daggerml/_internal/__init__.py" = ["F401", "E402"] # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] @@ -132,10 +131,23 @@ exclude_lines = [ [tool.pyright] include = ["src"] +exclude = ["tests"] defineConstant = { DEBUG = true } typeCheckingMode = "basic" reportGeneralTypeIssues = false reportMissingImports = "error" +reportMissingModuleSource = false reportMissingTypeStubs = false reportOptionalCall = false reportOptionalMemberAccess = false + +[dependency-groups] +dev = [ + "hypothesis>=6.141.1", + "moto[all,server]>=5.1.20,<5.2.0", # max version because moto s3 bug + "polars>=1.39.0", + "pyright>=1.1.409", + "pytest>=8.4.2", + "pytest-cov>=7.0.0", + "ruff>=0.14.10", +] diff --git a/src/daggerml/__init__.py b/src/daggerml/__init__.py index a1ab66a..701fe08 100644 --- a/src/daggerml/__init__.py +++ b/src/daggerml/__init__.py @@ -5,12 +5,43 @@ with strong typing support and a context-manager based interface. 
""" -from daggerml.core import Dag, Dml, Error, Executable, Node, Resource +from daggerml.api import ( + Dag, + Dml, + Error, + Node, + Ref, + Runnable, + Uri, + clear_default_dml, + get_default_dml, + load, + new, + set_default_dml, + status, + temporary, + use_default_dml, +) try: from daggerml.__about__ import __version__ except ImportError: __version__ = "local" - -__all__ = ("Dag", "Dml", "Error", "Executable", "Node", "Resource") +__all__ = ( + "Dag", + "Dml", + "Error", + "Node", + "Ref", + "Uri", + "Runnable", + "get_default_dml", + "set_default_dml", + "use_default_dml", + "clear_default_dml", + "new", + "load", + "status", + "temporary", +) diff --git a/src/daggerml/_cli.py b/src/daggerml/_cli.py new file mode 100644 index 0000000..3387796 --- /dev/null +++ b/src/daggerml/_cli.py @@ -0,0 +1,376 @@ +from __future__ import annotations + +import argparse +import inspect +import json +import logging +import sys +import types +import typing +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Annotated, Any, Callable, Literal, Union, cast, get_args, get_origin + +from daggerml._internal import Dml, Ref, Runnable, Uri + + +class PrettyArgumentParser(argparse.ArgumentParser): + def error(self, message: str) -> None: + self.print_usage(sys.stderr) + self.exit(2, f"error: {message}\n") + + +@dataclass(frozen=True) +class _Target: + path: tuple[str, ...] + method_name: str + kind: Literal["instance", "classmethod"] + + +class MethodCLI: + """ + Turn a class with methods and nested namespace objects into an argparse CLI. + + - Accepts a class constructor, not a class instance. + - Root args/options/flags come from the class constructor signature. + - A root -v / -vv / -vvv flag configures logging level. + - The class is instantiated after root args are parsed. + - Public instance methods become commands. + - Public annotated namespace attributes become namespaces. 
+ - Required method parameters become positional args. + - Constructor parameters are root options. + - Parameters with defaults become options/flags. + - Command and option names use kebab-case. + - Positional arg names remain snake_case. + - bool kwargs are always flags. + - default False/None: --foo + - default True: --no-foo + - Annotated[T, "help"] provides per-argument help. + - Docstrings provide command help and optional parameter help. + - Results are printed to stdout as JSON. + """ + + def __init__( + self, + cls: type[Any], + *, + prog: str | None = None, + parsers: dict[Any, Callable[[str], Any]] | None = None, + ) -> None: + if not isinstance(cls, type): + raise TypeError("MethodCLI expects a class, not an instance") + self.cls = cls + self.parsers = dict(parsers or {}) + self.parser = PrettyArgumentParser( + prog=prog or self._kebab(cls.__name__), + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + self.parser.add_argument( + "-v", + dest="_verbosity", + action="count", + default=0, + help="Increase logging verbosity. 
Use -v, -vv, or -vvv.", + ) + self._add_constructor_args(self.parser) + self._build_namespace_from_type(cls, self.parser, path=()) + + def main(self, argv: list[str] | None = None) -> int: + try: + return self.run(argv) + except KeyboardInterrupt: + print("error: interrupted", file=sys.stderr) + return 130 + except Exception as exc: + logging.exception("command failed") + print(f"error: {exc}", file=sys.stderr) + return 1 + + def run(self, argv: list[str] | None = None) -> int: + ns = self.parser.parse_args(argv) + data = vars(ns) + verbosity = data.pop("_verbosity", 0) + self._configure_logging(verbosity) + target = data.pop("_target", None) + if target is None: + self.parser.error("missing command") + init_kwargs = { + name: data.pop(f"_init_{name}") for name in self._constructor_param_names() if f"_init_{name}" in data + } + method_kwargs = {k: v for k, v in data.items() if not k.startswith("_subcommand_")} + target = cast(_Target, target) + root = self.cls if target.kind == "classmethod" else self.cls(**init_kwargs) + method = self._resolve_method(root, target) + result = method(**method_kwargs) + print(json.dumps(result, indent=2, sort_keys=True, default=self._json_default)) + return 0 + + def _configure_logging(self, verbosity: int) -> None: + level = logging.WARNING if verbosity <= 0 else logging.INFO if verbosity == 1 else logging.DEBUG + logging.basicConfig(level=level, format="%(levelname)s: %(message)s", force=True) + + def _add_constructor_args(self, parser: argparse.ArgumentParser) -> None: + init = self.cls.__init__ + doc = self._parse_docstring(inspect.getdoc(init) or inspect.getdoc(self.cls) or "") + group = parser.add_argument_group("constructor arguments") + self._add_callable_args( + group, init, doc.param_help, dest_prefix="_init_", skip_self=True, required_as_options=True + ) + + def _constructor_param_names(self) -> list[str]: + sig = inspect.signature(self.cls.__init__) + return [ + name + for name, param in sig.parameters.items() + if name 
!= "self" and param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD) + ] + + def _build_namespace_from_type( + self, typ: type[Any], parser: argparse.ArgumentParser, path: tuple[str, ...] + ) -> None: + subparsers = parser.add_subparsers(dest="_subcommand_" + "_".join(path or ("root",)), required=True) + for name, member in sorted(vars(typ).items()): + if self._is_public_method_descriptor(name, member): + command_name = self._kebab(name) + doc = self._parse_docstring(inspect.getdoc(member) or "") + child = subparsers.add_parser( + command_name, + help=doc.summary, + description=doc.description or doc.summary, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + child.set_defaults(_target=_Target(path=path, method_name=name, kind="instance")) + self._add_callable_args(child, member, doc.param_help, skip_self=True, required_as_options=False) + elif not path and self._is_public_root_classmethod_descriptor(name, member): + command_name = self._kebab(name) + method = getattr(typ, name) + doc = self._parse_docstring(inspect.getdoc(method) or "") + child = subparsers.add_parser( + command_name, + help=doc.summary, + description=doc.description or doc.summary, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + child.set_defaults(_target=_Target(path=path, method_name=name, kind="classmethod")) + self._add_callable_args(child, method, doc.param_help, skip_self=False, required_as_options=False) + for name, namespace_type in sorted(self._namespace_types_for(typ).items()): + child = subparsers.add_parser( + self._kebab(name), + help=f"{name} commands", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + self._build_namespace_from_type(namespace_type, child, path + (name,)) + + def _namespace_types_for(self, typ: type[Any]) -> dict[str, type[Any]]: + out: dict[str, type[Any]] = {} + for name, member in vars(typ).items(): + if name.startswith("_"): + continue + if not isinstance(member, property): + continue + fget = member.fget + if fget 
is None: + continue + try: + prop_hints = typing.get_type_hints(fget, include_extras=True) + except Exception: + prop_hints = getattr(fget, "__annotations__", {}) + ret = prop_hints.get("return") + if ret is None: + continue + base, _ = self._split_annotated(ret) + if isinstance(base, type) and self._looks_like_namespace_type(base): + out[name] = base + return out + + def _looks_like_namespace_type(self, typ: type[Any]) -> bool: + scalar_types = (str, bytes, int, float, bool, list, tuple, dict, set, Path) + if typ in scalar_types or issubclass(typ, Enum): + return False + return any(self._is_public_method_descriptor(n, m) for n, m in vars(typ).items()) + + def _add_callable_args( + self, + parser_or_group: argparse.ArgumentParser | argparse._ArgumentGroup, + fn: Callable[..., Any], + doc_help: dict[str, str], + *, + dest_prefix: str = "", + skip_self: bool, + required_as_options: bool, + ) -> None: + sig = inspect.signature(fn) + hints = typing.get_type_hints(fn, include_extras=True) + for name, param in sig.parameters.items(): + if skip_self and name == "self": + continue + if param.kind in (param.VAR_POSITIONAL, param.VAR_KEYWORD): + raise TypeError(f"{fn.__qualname__}: *args and **kwargs are not supported") + typ, annotated_help = self._split_annotated(hints.get(name, param.annotation)) + help_text = annotated_help or doc_help.get(name) or self._type_display(typ) + has_default = param.default is not inspect._empty + default = None if not has_default else param.default + dest = f"{dest_prefix}{name}" + if self._is_bool_type(typ): + if not has_default: + raise TypeError(f"{fn.__qualname__}.{name}: bool parameters must have defaults") + self._add_bool_flag(parser_or_group, name, dest, default, help_text) + continue + converter, extra = self._parser_for(typ) + if has_default: + kwargs: dict[str, Any] = {"dest": dest, "default": default, "help": help_text, **extra} + if converter is not None: + kwargs["type"] = converter + 
parser_or_group.add_argument(f"--{self._kebab(name)}", **kwargs) + elif required_as_options: + kwargs = {"dest": dest, "required": True, "help": help_text, **extra} + if converter is not None: + kwargs["type"] = converter + parser_or_group.add_argument(f"--{self._kebab(name)}", **kwargs) + else: + kwargs = {"help": help_text, **extra} + if converter is not None: + kwargs["type"] = converter + parser_or_group.add_argument(name, **kwargs) + + def _add_bool_flag( + self, + parser_or_group: argparse.ArgumentParser | argparse._ArgumentGroup, + name: str, + dest: str, + default: Any, + help_text: str, + ) -> None: + kebab = self._kebab(name) + if default is True: + parser_or_group.add_argument(f"--no-{kebab}", dest=dest, action="store_false", default=True, help=help_text) + else: + parser_or_group.add_argument( + f"--{kebab}", dest=dest, action="store_true", default=bool(default), help=help_text + ) + + def _resolve_method(self, root: Any, target: _Target) -> Callable[..., Any]: + obj = root + for name in target.path: + obj = getattr(obj, name) + return getattr(obj, target.method_name) + + def _parser_for(self, typ: Any) -> tuple[Callable[[str], Any] | None, dict[str, Any]]: + typ, _ = self._unwrap_optional(typ) + if typ in self.parsers: + return self.parsers[typ], {} + origin = get_origin(typ) + args = get_args(typ) + if origin is Literal: + choices = list(args) + parser = type(choices[0]) if choices else str + return parser, {"choices": choices} + if typ in {Ref, Uri}: + return lambda s: typ(s), {} + if typ in (str, int, float): + return typ, {} + if origin in (list, dict): + return self._json_parser(typ), {} + return str, {} + + def _json_parser(self, typ: Any) -> Callable[[str], Any]: + def parse(value: str) -> Any: + try: + return json.loads(value) + except json.JSONDecodeError as exc: + raise argparse.ArgumentTypeError(f"expected JSON for {self._type_display(typ)}: {exc.msg}") from exc + + return parse + + def _split_annotated(self, typ: Any) -> tuple[Any, str | 
None]: + if get_origin(typ) is Annotated: + base, *metadata = get_args(typ) + for item in metadata: + if isinstance(item, str): + return base, item + return base, None + return typ, None + + def _unwrap_optional(self, typ: Any) -> tuple[Any, bool]: + origin = get_origin(typ) + args = get_args(typ) + if origin in (Union, types.UnionType) and type(None) in args: + rest = tuple(a for a in args if a is not type(None)) + if len(rest) == 1: + return rest[0], True + return typ, False + + def _is_bool_type(self, typ: Any) -> bool: + typ, _ = self._unwrap_optional(typ) + return typ is bool + + def _is_public_method_descriptor(self, name: str, member: Any) -> bool: + if name.startswith("_"): + return False + if isinstance(member, (staticmethod, classmethod)): + return False + return inspect.isfunction(member) + + def _is_public_root_classmethod_descriptor(self, name: str, member: Any) -> bool: + return not name.startswith("_") and isinstance(member, classmethod) + + def _json_default(self, obj: Any) -> Any: + if isinstance(obj, Ref): + return str(obj.to) + if isinstance(obj, Uri): + return obj.uri + if isinstance(obj, Runnable): + raise NotImplementedError("Runnable objects cannot be serialized to JSON directly") + raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") + + def _kebab(self, name: str) -> str: + return name.replace("_", "-") + + def _type_display(self, typ: Any) -> str: + if typ is inspect._empty: + return "value" + return str(typ).replace("typing.", "") + + @dataclass + class _Doc: + summary: str | None + description: str | None + param_help: dict[str, str] + + def _parse_docstring(self, doc: str) -> _Doc: + if not doc: + return self._Doc(None, None, {}) + lines = doc.splitlines() + summary = lines[0].strip() or None + param_help: dict[str, str] = {} + desc_lines: list[str] = [] + in_args = False + current: str | None = None + for raw in lines[1:]: + stripped = raw.strip() + if stripped in {"Args:", "Arguments:", "Parameters:"}: + 
in_args = True + current = None + continue + if in_args: + if not stripped: + continue + if not raw.startswith((" ", "\t")): + in_args = False + current = None + else: + if ":" in stripped: + key, text = stripped.split(":", 1) + current = key.strip() + param_help[current] = text.strip() + elif current: + param_help[current] = (param_help[current] + " " + stripped).strip() + continue + if not in_args and stripped: + desc_lines.append(stripped) + return self._Doc(summary, "\n".join(desc_lines).strip() or None, param_help) + + +def cli() -> None: + raise SystemExit(MethodCLI(Dml, prog="dml").main()) diff --git a/src/daggerml/_internal/__init__.py b/src/daggerml/_internal/__init__.py new file mode 100644 index 0000000..e53d48a --- /dev/null +++ b/src/daggerml/_internal/__init__.py @@ -0,0 +1,28 @@ +"""DML Repository Native Implementation.""" + +from __future__ import annotations + +from daggerml._internal._db import DmlDbInvalidPathError, DmlDbInvalidRefError, Ref +from daggerml._internal.dml import Dml +from daggerml._internal.exec_state import CancelledExecutionError, ExecutionState +from daggerml._internal.execution_context import execution_context +from daggerml._internal.types import ( + DmlRepoError, + Error, + Runnable, + Uri, +) + +__all__ = ( + "Dml", + "DmlDbInvalidPathError", + "DmlDbInvalidRefError", + "DmlRepoError", + "Error", + "CancelledExecutionError", + "ExecutionState", + "Ref", + "Runnable", + "Uri", + "execution_context", +) diff --git a/src/daggerml/_internal/_db.pyi b/src/daggerml/_internal/_db.pyi new file mode 100644 index 0000000..8babaaa --- /dev/null +++ b/src/daggerml/_internal/_db.pyi @@ -0,0 +1,86 @@ +from __future__ import annotations + +from collections.abc import Iterator +from contextlib import AbstractContextManager +from typing import Any, Literal, Self, overload + +class DmlDbError(Exception): ... +class DmlDbInvalidHandleError(DmlDbError): ... +class DmlDbClosedError(DmlDbError): ... +class DmlDbForkedError(DmlDbError): ... 
+class DmlDbInvalidTxnError(DmlDbError): ... +class DmlDbReadonlyTxnError(DmlDbError): ... +class DmlDbForkedTxnError(DmlDbError): ... +class DmlDbInvalidInputError(ValueError, DmlDbError): ... +class DmlDbInvalidTypeError(ValueError, DmlDbError): ... +class DmlDbInvalidPathError(ValueError, DmlDbError): ... +class DmlDbInvalidRefError(ValueError, DmlDbError): ... +class DmlDbInvalidNamespaceError(ValueError, DmlDbError): ... +class DmlDbKeyNotFoundError(DmlDbError): ... +class DmlDbKeyExistsError(DmlDbError): ... +class DmlDbMsgpackError(DmlDbError): ... +class DmlDbOutOfMemoryError(MemoryError, DmlDbError): ... +class DmlDbMapFullError(DmlDbError): ... +class DmlDbBusyError(DmlDbError): ... +class DmlDbLmdbError(DmlDbError): ... +class DmlDbInternalError(DmlDbError): ... +class DmlDbEnvReopenedError(DmlDbError): ... + +class Ref: + to: str + + def __init__(self, to: str) -> None: ... + def __repr__(self) -> str: ... + def __lt__(self, other: Ref) -> bool: ... + def __le__(self, other: Ref) -> bool: ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... + def __gt__(self, other: Ref) -> bool: ... + def __ge__(self, other: Ref) -> bool: ... + def __hash__(self) -> int: ... + def ns(self) -> str: ... + def id(self) -> str: ... + def nss(self) -> list[str]: ... + +class DmlDbEnv: + path: str + namespaces: tuple[str, ...] + + @property + def closed(self) -> bool: ... + @classmethod + def _open( + cls, path: str, namespaces: list[str], create_if_missing: bool = False, map_size: int | None = None + ) -> Self: ... + @classmethod + def create(cls, path: str, namespaces: list[str], map_size: int | None = None) -> Self: ... + @classmethod + def open(cls, path: str, namespaces: list[str], map_size: int | None = None) -> Self: ... + def get_size(self) -> int: ... + def resize(self, new_size: int) -> None: ... + def tx(self, readonly: bool = True) -> AbstractContextManager[DmlDbEnvTxn]: ... + def close(self) -> None: ... 
+ +class DmlDbEnvTxn: + @property + def closed(self) -> bool: ... + def commit(self) -> None: ... + def abort(self) -> None: ... + def put( + self, + value: object, + *, + ns: str | None = None, + to: Ref | None = None, + no_overwrite: bool = False, + raw: bool = False, + ) -> Ref: ... + @overload + def get(self, key: Ref, raw: Literal[False] = False) -> dict: ... + @overload + def get(self, key: Ref, raw: Literal[True]) -> str: ... + def get(self, key: Ref, raw: bool = False) -> dict | str: ... + def delete(self, key: Ref) -> None: ... + def exists(self, key: Ref) -> bool: ... + def iter(self, ns: str, start_token: Any = None) -> Iterator[tuple[Ref, object]]: ... + def list_orphans(self, start: list[Ref]) -> list[Ref]: ... diff --git a/src/daggerml/_internal/_db.pyx b/src/daggerml/_internal/_db.pyx new file mode 100644 index 0000000..0e2b4b6 --- /dev/null +++ b/src/daggerml/_internal/_db.pyx @@ -0,0 +1,1300 @@ +# cython: language_level=3 +""" +Standalone Cython extension module for database operations. 
+ +This module wraps the underlying C library (dml_db) that handles: +- LMDB database operations +- Data serialization/deserialization +- Reference management +- Transaction handling +""" +import logging +import os +import sys +import threading +import base64 +from contextlib import contextmanager +from libc.stdlib cimport malloc, free, calloc +from libc.string cimport strlen, memcpy + +from cpython.bytes cimport PyBytes_AsStringAndSize, PyBytes_FromStringAndSize +from cpython.float cimport PyFloat_AsDouble +from cpython.long cimport PyLong_AsLongLongAndOverflow +from cpython.unicode cimport PyUnicode_AsUTF8, PyUnicode_AsUTF8AndSize, PyUnicode_DecodeUTF8 +from cpython.exc cimport PyErr_Clear, PyErr_Occurred + + +logger = logging.getLogger(__name__) +MAX_STRING_BYTES = 1024 * 1024 +MAX_COLLECTION_LEN = 100000 + +cdef extern from "dml_value.h": + ctypedef enum DmlValueType: + DML_VALUE_NULL + DML_VALUE_BOOL + DML_VALUE_INT + DML_VALUE_FLOAT + DML_VALUE_STR + DML_VALUE_LIST + DML_VALUE_MAP + DML_VALUE_REF + + ctypedef struct DmlValue + + ctypedef struct DmlMapEntry: + char *key + size_t key_len + DmlValue *value + + ctypedef struct DmlValueStr: + char *data + size_t size + + ctypedef struct DmlValueList: + DmlValue **items + size_t count + + ctypedef struct DmlValueMap: + DmlMapEntry *entries + size_t count + + ctypedef union DmlValueAs: + int boolean + long long integer + double floating + DmlValueStr str + DmlValueList list + DmlValueMap map + DmlValueStr ref + + ctypedef struct DmlValue: + DmlValueType type + DmlValueAs as + + DmlValue *dml_value_new_null() nogil + DmlValue *dml_value_new_bool(int value) nogil + DmlValue *dml_value_new_int(long long value) nogil + DmlValue *dml_value_new_float(double value) nogil + DmlValue *dml_value_new_str(const char *data, size_t size) nogil + DmlValue *dml_value_new_ref(const char *data, size_t size) nogil + DmlValue *dml_value_new_list(size_t count) nogil + int dml_value_list_set(DmlValue *list, size_t index, DmlValue *item) 
nogil + DmlValue *dml_value_new_map(size_t count) nogil + int dml_value_map_set(DmlValue *map, size_t index, const char *key, size_t key_len, DmlValue *value) nogil + void dml_value_free(DmlValue *value) nogil + int dml_ref_split( + const char *ref, + size_t ref_len, + const char **namespace_str, + size_t *namespace_len, + const char **id_str, + size_t *id_len + ) nogil + +cdef extern from "dml_db.h": + int DML_DB_ERR_HANDLE_INVALID + int DML_DB_ERR_HANDLE_CLOSED + int DML_DB_ERR_HANDLE_FORKED + int DML_DB_ERR_TXN_INVALID + int DML_DB_ERR_TXN_READONLY + int DML_DB_ERR_TXN_FORKED + int DML_DB_ERR_INPUT_INVALID + int DML_DB_ERR_TYPE_INVALID + int DML_DB_ERR_PATH_INVALID + int DML_DB_ERR_REF_INVALID + int DML_DB_ERR_NAMESPACE_INVALID + int DML_DB_ERR_NOT_FOUND + int DML_DB_ERR_KEY_EXISTS + int DML_DB_ERR_MSGPACK + int DML_DB_ERR_NOMEM + int DML_DB_ERR_MAP_FULL + int DML_DB_ERR_BUSY + int DML_DB_ERR_LMDB + int DML_DB_ERR_INTERNAL + int DML_DB_ERR_ENV_REOPENED + + ctypedef struct DmlDbHandle: + pass + + ctypedef struct DmlDbTxn: + pass + + ctypedef struct DmlObjCollection: + char *keys + size_t *key_lens + DmlValue **values + size_t count + char *next_token + + int dml_db_open( + const char *path, + const char *const *namespaces, + size_t namespace_count, + const int create_if_missing, + size_t map_size, + DmlDbHandle **out_handle, + ) nogil + int dml_db_close(DmlDbHandle **p_handle) nogil + + int dml_db_mapsize(DmlDbHandle **p_handle, size_t *out_mapsize) nogil + int dml_db_resize(DmlDbHandle **p_handle, size_t mapsize) nogil + + int dml_db_txn_begin(DmlDbHandle **p_handle, const int readonly, DmlDbTxn **out_txn); + int dml_db_txn_fin(DmlDbHandle **p_handle, DmlDbTxn *txn, const int commit) nogil + + int dml_db_put( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + const DmlValue *value, + int no_overwrite, + int raw, + DmlValue **out_ref + ) nogil + int dml_db_get( + DmlDbHandle **p_handle, + DmlDbTxn 
*txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + int raw, + DmlValue **out_value + ) nogil + int dml_db_del( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len + ) nogil + int dml_db_exists( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + size_t ns_len, + const char *key, + size_t key_len, + int *out_exists + ) nogil + + int dml_db_iter_keys( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *ns, + const char *start_token, + DmlObjCollection *out_page + ) nogil + void dml_db_free_obj_collection(DmlObjCollection *page) nogil + int dml_db_list_orphans( + DmlDbHandle **p_handle, + DmlDbTxn *txn, + const char *const *start_refs, + size_t start_refs_count, + DmlValue **out_refs + ) nogil + +cdef DmlValue *py_to_dml_value(object obj): + cdef DmlValue *result + cdef DmlValue *child + cdef DmlValue *data_value + cdef Py_ssize_t size + cdef const char *data + cdef Py_ssize_t i + cdef long long int_val + cdef int overflow + # keep Python-level references to temporary Unicode objects + cdef object py_str + cdef object py_to_obj + cdef object py_map_key + + if obj is None: + result = dml_value_new_null() + if result == NULL: + raise MemoryError() + return result + if isinstance(obj, bool): + result = dml_value_new_bool(1 if obj else 0) + if result == NULL: + raise MemoryError() + return result + if isinstance(obj, int): + overflow = 0 + int_val = PyLong_AsLongLongAndOverflow(obj, &overflow) + if overflow != 0: + raise DmlDbInvalidTypeError( + f"invalid int value for db storage: {obj!r} " + "(reason: out of range for int64)" + ) + return NULL + if PyErr_Occurred() != NULL: + PyErr_Clear() + raise DmlDbInvalidTypeError( + f"invalid int value for db storage: {obj!r} " + "(reason: conversion failed)" + ) + return NULL + result = dml_value_new_int(int_val) + if result == NULL: + raise MemoryError() + return result + if isinstance(obj, float): + result = 
dml_value_new_float(PyFloat_AsDouble(obj)) + if result == NULL: + raise MemoryError() + return result + if isinstance(obj, str): + py_str = obj + data = PyUnicode_AsUTF8AndSize(py_str, &size) + if data == NULL: + return NULL + if size > MAX_STRING_BYTES: + raise DmlDbInvalidTypeError( + f"invalid string value for db storage: {obj!r} " + f"(reason: exceeds max length {MAX_STRING_BYTES})" + ) + result = dml_value_new_str(data, size) + if result == NULL: + raise MemoryError() + return result + if isinstance(obj, Ref): + py_to_obj = obj.to + data = PyUnicode_AsUTF8AndSize(py_to_obj, &size) + if data == NULL: + return NULL + result = dml_value_new_ref(data, size) + if result == NULL: + raise MemoryError() + return result + if isinstance(obj, (list, tuple)): + size = len(obj) + if size > MAX_COLLECTION_LEN: + raise DmlDbInvalidTypeError( + f"invalid list value for db storage: {obj!r} " + f"(reason: exceeds max length {MAX_COLLECTION_LEN})" + ) + result = dml_value_new_list(size) + if result == NULL: + raise MemoryError() + try: + for i in range(size): + child = py_to_dml_value(obj[i]) + if child == NULL: # result is released exactly once, by the except handler below + raise MemoryError("Failed to convert list item") + if dml_value_list_set(result, i, child) != 0: + dml_value_free(child) # assumes a failed set does not adopt child, as the dict branch does for dml_value_map_set + raise DmlDbInvalidTypeError( + f"invalid list entry for db storage: {obj[i]!r} " + "(reason: unsupported type)" + ) + return result + except Exception: + dml_value_free(result) + raise # re-raise the original error; raise(obj) would raise the list itself (TypeError) + if isinstance(obj, dict): + size = len(obj) + if size > MAX_COLLECTION_LEN: + raise DmlDbInvalidTypeError( + f"invalid dict value for db storage: {obj!r} " + f"(reason: exceeds max length {MAX_COLLECTION_LEN})" + ) + result = dml_value_new_map(size) + if result == NULL: + raise MemoryError() + i = 0 + for key, value in obj.items(): + if not isinstance(key, str): + dml_value_free(result) + raise DmlDbInvalidTypeError( + f"invalid dict key for db storage: {key!r} " + "(reason: keys must be str)" + ) + py_map_key = key + data = 
PyUnicode_AsUTF8AndSize(py_map_key, &size) + if data == NULL: + dml_value_free(result) + return NULL + child = py_to_dml_value(value) + if child == NULL: + dml_value_free(result) + return NULL + if dml_value_map_set(result, i, data, size, child) != 0: + dml_value_free(child) + dml_value_free(result) + raise DmlDbInvalidTypeError( + f"invalid dict value for db storage: {value!r} " + "(reason: unsupported type)" + ) + i += 1 + return result + raise DmlDbInvalidTypeError( + f"invalid value for db storage: {obj!r} " + f"(reason: unsupported type {type(obj).__name__})" + ) + return NULL + +cdef object dml_value_to_py(const DmlValue *value): + cdef size_t i + cdef object py_obj + cdef object py_key + + if value == NULL: + raise ValueError("Invalid MessagePack payload") + + if value.type == DML_VALUE_NULL: + return None + if value.type == DML_VALUE_BOOL: + return bool(value.as.boolean) + if value.type == DML_VALUE_INT: + return int(value.as.integer) + if value.type == DML_VALUE_FLOAT: + return float(value.as.floating) + if value.type == DML_VALUE_STR: + return PyUnicode_DecodeUTF8(value.as.str.data, value.as.str.size, "replace") + if value.type == DML_VALUE_REF: + py_obj = PyUnicode_DecodeUTF8(value.as.ref.data, value.as.ref.size, "strict") + return Ref(py_obj) + if value.type == DML_VALUE_LIST: + py_list = [] + for i in range(value.as.list.count): + py_list.append(dml_value_to_py(value.as.list.items[i])) + return py_list + if value.type == DML_VALUE_MAP: + py_dict = {} + for i in range(value.as.map.count): + entry = &value.as.map.entries[i] + py_key = PyUnicode_DecodeUTF8(entry.key, entry.key_len, "strict") + py_dict[py_key] = dml_value_to_py(entry.value) + return py_dict + raise TypeError("Unsupported msgpack object type") + +class DmlDbError(Exception): + """ + Base error for database operations. + + Notes + ----- + Subclasses provide specific failure details. + """ + pass + +class DmlDbInvalidHandleError(DmlDbError): + """ + Invalid database handle. 
+ + Notes + ----- + Raised when a handle is NULL or uninitialized. + """ + pass + +class DmlDbClosedError(DmlDbError): + """ + Database handle is closed. + + Notes + ----- + Raised when operations use a closed handle. + """ + pass + +class DmlDbForkedError(DmlDbError): + """ + Database handle used after fork without reopen. + + Notes + ----- + Raised when a handle is reused across fork boundaries. + """ + pass + +class DmlDbInvalidTxnError(DmlDbError): + """ + Transaction is invalid or closed. + + Notes + ----- + Raised when a transaction pointer is NULL or closed. + """ + pass + +class DmlDbReadonlyTxnError(DmlDbError): + """ + Write attempted in a read-only transaction. + + Notes + ----- + Raised when write operations run in read-only transactions. + """ + pass + +class DmlDbForkedTxnError(DmlDbError): + """ + Transaction used after a fork. + """ + pass + +class DmlDbInvalidInputError(ValueError, DmlDbError): + """ + Invalid input supplied to a database call. + + Notes + ----- + Covers null pointers and empty strings where disallowed. + """ + pass + +class DmlDbInvalidTypeError(ValueError, DmlDbError): + """ + Input type is not representable in the database. + + Notes + ----- + Raised for Python types or values that cannot be encoded. + """ + pass + +class DmlDbInvalidPathError(ValueError, DmlDbError): + """ + Database path is invalid or inaccessible. + + Notes + ----- + Raised when the filesystem path does not exist or cannot be opened. + """ + pass + +class DmlDbInvalidRefError(ValueError, DmlDbError): + """ + Invalid reference format. + + Notes + ----- + Raised when ref parsing fails. + """ + pass + +class DmlDbInvalidNamespaceError(ValueError, DmlDbError): + """ + Namespace is not configured or allowed. + + Notes + ----- + Raised when the namespace is missing from the configured list. + """ + pass + +class DmlDbKeyNotFoundError(DmlDbError): + """ + Database key does not exist. + + Notes + ----- + Raised when a lookup fails to find a key. 
+ """ + pass + +class DmlDbKeyExistsError(DmlDbError): + """ + Database key already exists. + + Notes + ----- + Raised when `no_overwrite` is set and the key is present. + """ + pass + +class DmlDbMsgpackError(DmlDbError): + """ + MessagePack encoding or decoding failed. + + Notes + ----- + Raised when serialization or deserialization fails. + """ + pass + +class DmlDbOutOfMemoryError(MemoryError, DmlDbError): + """ + Memory allocation failed in the database layer. + + Notes + ----- + Raised on allocation failures. + """ + pass + +class DmlDbMapFullError(DmlDbError): + """ + LMDB map is full; resize is required. + + Notes + ----- + Raised when LMDB reports a full map. + """ + pass + +class DmlDbBusyError(DmlDbError): + """ + Database is busy and could not acquire a lock. + + Notes + ----- + Raised when LMDB reports a busy/locked state. + """ + pass + +class DmlDbLmdbError(DmlDbError): + """ + Unclassified LMDB backend error. + + Notes + ----- + Raised for LMDB failures without a more specific mapping. + """ + pass + +class DmlDbInternalError(DmlDbError): + """ + Internal invariant failure in the database layer. + + Notes + ----- + Raised for unexpected internal failures. + """ + pass + +class DmlDbEnvReopenedError(DmlDbError): + """ + Database environment was reopened; transaction must be retried. + + Notes + ----- + Raised when the environment was repaired (e.g., after fork or EINVAL), + invalidating all existing transactions. Caller should retry the entire + transaction block. 
cdef inline object raise_if_error(int rc, str context):
    """
    Translate a native return code into a Python exception.

    Parameters
    ----------
    rc
        Return code from a native database call; ``0`` means success.
    context
        Text appended after the message prefix, separated by ``": "``.

    Returns
    -------
    None
        When `rc` is ``0``.

    Raises
    ------
    DmlDbError
        The subclass paired with `rc` in the table below.
    RuntimeError
        When `rc` is non-zero and not listed in the table.
    """
    if rc == 0:
        return

    # Ordered (code, exception class, message prefix) rows. The first matching
    # row wins, exactly like an if/elif ladder over the same codes.
    known_errors = (
        (DML_DB_ERR_HANDLE_INVALID, DmlDbInvalidHandleError, "database handle is invalid"),
        (DML_DB_ERR_HANDLE_CLOSED, DmlDbClosedError, "database handle is closed"),
        (DML_DB_ERR_HANDLE_FORKED, DmlDbForkedError, "database handle used after fork; call DB.reopen()"),
        (DML_DB_ERR_TXN_INVALID, DmlDbInvalidTxnError, "invalid or closed transaction"),
        (DML_DB_ERR_TXN_READONLY, DmlDbReadonlyTxnError, "read-only transaction"),
        (DML_DB_ERR_TXN_FORKED, DmlDbForkedTxnError, "transaction used after fork"),
        (DML_DB_ERR_INPUT_INVALID, DmlDbInvalidInputError, "invalid input"),
        (DML_DB_ERR_TYPE_INVALID, DmlDbInvalidTypeError, "invalid input type"),
        (DML_DB_ERR_PATH_INVALID, DmlDbInvalidPathError, "invalid database path"),
        (DML_DB_ERR_REF_INVALID, DmlDbInvalidRefError, "invalid ref format"),
        (DML_DB_ERR_NAMESPACE_INVALID, DmlDbInvalidNamespaceError, "invalid namespace"),
        (DML_DB_ERR_NOT_FOUND, DmlDbKeyNotFoundError, "data not found"),
        (DML_DB_ERR_KEY_EXISTS, DmlDbKeyExistsError, "key already exists"),
        (DML_DB_ERR_MSGPACK, DmlDbMsgpackError, "msgpack serialization error"),
        (DML_DB_ERR_NOMEM, DmlDbOutOfMemoryError, "out of memory"),
        (DML_DB_ERR_MAP_FULL, DmlDbMapFullError, "database map is full"),
        (DML_DB_ERR_BUSY, DmlDbBusyError, "database is busy"),
        (DML_DB_ERR_LMDB, DmlDbLmdbError, "lmdb error"),
        (DML_DB_ERR_INTERNAL, DmlDbInternalError, "internal database error"),
        (DML_DB_ERR_ENV_REOPENED, DmlDbEnvReopenedError, "database environment was reopened; retry transaction"),
    )
    for code, exc_type, label in known_errors:
        if rc == code:
            raise exc_type(f"{label}: {context}")

    # Unmapped codes fall back to a plain RuntimeError that carries the raw code.
    raise RuntimeError(f"unknown database error: {rc}: {context}")
+ + Returns + ------- + str + Namespace extracted from the reference. + + Raises + ------ + ValueError + If the reference format is invalid. + + Notes + ----- + This uses the database C parser so Python and C agree on ref structure. + """ + cdef const char *data + cdef Py_ssize_t size = 0 + cdef const char *ns = NULL + cdef const char *ident = NULL + cdef size_t ns_len = 0 + cdef size_t id_len = 0 + + data = PyUnicode_AsUTF8AndSize(self.to, &size) + if data == NULL: + return None + if dml_ref_split(data, size, &ns, &ns_len, &ident, &id_len) != 0: + raise ValueError("Invalid Ref format") + return PyUnicode_DecodeUTF8(ns, ns_len, "strict") + + def id(self): + """ + Return the identifier portion of the reference. + + Returns + ------- + str + Identifier extracted from the reference. + + Raises + ------ + ValueError + If the reference format is invalid. + + Notes + ----- + This complements `ns()` by exposing the ID while keeping the split + logic centralized in the database layer. + """ + cdef const char *data + cdef Py_ssize_t size = 0 + cdef const char *ns = NULL + cdef const char *ident = NULL + cdef size_t ns_len = 0 + cdef size_t id_len = 0 + + data = PyUnicode_AsUTF8AndSize(self.to, &size) + if data == NULL: + return None + if dml_ref_split(data, size, &ns, &ns_len, &ident, &id_len) != 0: + raise ValueError("Invalid Ref format") + return PyUnicode_DecodeUTF8(ident, id_len, "strict") + + def nss(self): + """ + Return the namespace hierarchy as a list. + + Returns + ------- + list[str] + Namespace hierarchy split by '-'. + + Raises + ------ + ValueError + If the reference format is invalid. 
@classmethod
def _open(cls, str path, list[str] namespaces, bint create_if_missing=False, map_size=None):
    """
    Open a database environment and wrap the native handle.

    Parameters
    ----------
    path
        Filesystem path handed to ``dml_db_open``.
    namespaces
        Non-empty list of namespace names. Each name is copied into a
        NUL-terminated C string that lives only for the native call.
    create_if_missing
        Forwarded to ``dml_db_open`` as ``1`` or ``0``.
    map_size
        Positive ``int`` forwarded as the map size, or ``None`` to pass ``0``.

    Returns
    -------
    DmlDbEnv
        New instance holding the opened handle with ``_owns_handle`` set.

    Raises
    ------
    ValueError
        If `namespaces` is empty, `map_size` is not positive, or a string
        cannot be exposed as UTF-8.
    TypeError
        If `map_size` is neither ``int`` nor ``None``.
    MemoryError
        If a native allocation fails.
    Exception
        Whatever ``raise_if_error`` raises for a non-zero ``dml_db_open``
        return code.

    Notes
    -----
    The ``finally`` block is the only place that releases the namespace
    copies. Error branches must just raise: freeing there as well would
    release the same pointers twice.
    """
    cdef Py_ssize_t i, n = len(namespaces)
    cdef const char **ns_c = NULL
    cdef DmlDbHandle* _handle = NULL
    cdef const char* path_c = PyUnicode_AsUTF8(path)
    if path_c == NULL:
        raise ValueError("Cannot unicode")
    cdef size_t map_size_c = 0
    cdef int rc
    cdef Py_ssize_t ns_size = 0
    cdef object py_ns
    cdef const char *ns_ptr
    cdef char *c_copy
    if n == 0:
        raise ValueError("namespaces must be non-empty")
    if map_size is not None:
        if not isinstance(map_size, int):
            raise TypeError("map_size must be int or None")
        if map_size <= 0:
            raise ValueError("map_size must be positive")
        map_size_c = map_size

    try:
        # calloc zero-fills the array, so slots that were never populated stay
        # NULL and the cleanup loop below can skip them safely.
        ns_c = <const char **>calloc(n, sizeof(const char *))
        if ns_c == NULL:
            raise MemoryError()
        for i in range(n):
            py_ns = namespaces[i]
            ns_ptr = PyUnicode_AsUTF8AndSize(py_ns, &ns_size)
            if ns_ptr == NULL:
                raise ValueError("Cannot unicode")
            # Allocate an owned copy for the C library to use.
            c_copy = <char *>malloc(ns_size + 1)
            if c_copy == NULL:
                raise MemoryError()
            memcpy(c_copy, ns_ptr, ns_size)
            c_copy[ns_size] = '\0'
            ns_c[i] = c_copy
        rc = dml_db_open(
            path_c,
            ns_c,
            n,
            1 if create_if_missing else 0,
            map_size_c,
            &_handle
        )
        raise_if_error(rc, "dml_db_open")
    finally:
        if ns_c != NULL:
            # Free the copied namespace strings, then the array itself.
            for i in range(n):
                if ns_c[i] != NULL:
                    free(<void *>ns_c[i])
            free(<void *>ns_c)

    cdef DmlDbEnv obj = cls.__new__(cls)
    obj.path = path
    obj.namespaces = tuple(namespaces)
    obj._handle = _handle
    obj._owns_handle = 1
    return obj
= NULL + + def __dealloc__(self): + self.close() + + +cdef class DmlDbEnvTxn: + """Transaction wrapper. + + Holds a strong reference to the environment to prevent env close/GC while txn is live. + """ + + cdef DmlDbEnv _env + cdef DmlDbTxn* _txn + cdef int _closed + + property closed: + def __get__(self) -> bool: + return self._closed or self._txn == NULL + + cdef inline void _check(self) except *: + if self._env is None or self._env._handle == NULL: + raise RuntimeError("env is closed") + if self._txn == NULL or self._closed: + raise RuntimeError("txn is closed") + + cdef inline void finish(self, success: bool = True) except *: + cdef int rc; + with self._env._lock: + if success: + self._check() + if self._txn == NULL: + raise RuntimeError("txn is closed") + rc = dml_db_txn_fin(&self._env._handle, self._txn, 1 if success else 0) + self._txn = NULL + self._closed = 1 + if success: + raise_if_error(rc, "dml_db_txn_commit") + + def commit(self): + self.finish(success=True) + + def abort(self): + try: + self.finish(success=False) + except RuntimeError: + pass + + # --- Data operations on the txn --- + + def put(self, object value, *, str ns=None, Ref to=None, bint no_overwrite=False, bint raw=False) -> Ref: + self._check() + if to is not None and ns is not None: + raise RuntimeError("ns and to were both not None.") + if to is None and ns is None: + raise RuntimeError("Both ns and to were None.") + cdef int c_no_overwrite = 1 if no_overwrite else 0 + cdef DmlValue *out_ref = NULL + cdef const char *key_char = NULL + cdef const char *ns_char = NULL + cdef Py_ssize_t key_size = 0 + cdef Py_ssize_t ns_size = 0 + cdef int rc + cdef DmlValue *dv = NULL + cdef object py_id = None + cdef object py_ns = None + cdef const char *data + cdef Py_ssize_t size + + try: + if raw: + if not isinstance(value, str): + raise TypeError("raw=True requires value to be a base64 encoded string") + decoded_bytes = base64.b64decode(value) + if PyBytes_AsStringAndSize(decoded_bytes, &data, &size) 
!= 0: + raise ValueError("Invalid bytes object") + dv = dml_value_new_str(data, size) + if dv == NULL: + raise MemoryError() + # keep decoded_bytes alive + _keep_decoded = decoded_bytes + else: + dv = py_to_dml_value(value) + if dv == NULL: + raise MemoryError("py_to_dml_value returned NULL") + if to is not None: + ns = to.ns() + py_id = to.id() + key_char = PyUnicode_AsUTF8AndSize(py_id, &key_size) + if key_char == NULL: + raise MemoryError("Insufficient memory") + py_ns = ns + ns_char = PyUnicode_AsUTF8AndSize(py_ns, &ns_size) + if ns_char == NULL: + raise MemoryError("Insufficient memory") + rc = dml_db_put(&self._env._handle, self._txn, + ns_char, ns_size, + key_char, key_size, + dv, c_no_overwrite, 1 if raw else 0, &out_ref) + finally: + if dv != NULL: + dml_value_free(dv) + raise_if_error(rc, "dml_db_put") + if out_ref == NULL: + raise RuntimeError("dml_db_put succeeded but out_ref is NULL") + try: + return dml_value_to_py(out_ref) + finally: + dml_value_free(out_ref) + + def get(self, Ref key, bint raw=False) -> object: + self._check() + cdef const char *key_char + cdef const char *ns_char + cdef Py_ssize_t key_size + cdef Py_ssize_t ns_size + cdef DmlValue *out_value = NULL + cdef int rc + cdef object py_id = None + cdef object py_ns = None + + try: + py_id = key.id() + key_char = PyUnicode_AsUTF8AndSize(py_id, &key_size) + if key_char == NULL: + raise MemoryError("Insufficient memory") + py_ns = key.ns() + ns_char = PyUnicode_AsUTF8AndSize(py_ns, &ns_size) + if ns_char == NULL: + raise MemoryError("Insufficient memory") + # keep python objects alive while native call uses pointers + _keep_id = py_id + _keep_ns = py_ns + + rc = dml_db_get(&self._env._handle, self._txn, + ns_char, ns_size, + key_char, key_size, + 1 if raw else 0, + &out_value) + raise_if_error(rc, f"db.get({key.to})") + + if out_value == NULL: + raise RuntimeError("dml_db_get returned NULL value") + if raw: + raw_bytes = PyBytes_FromStringAndSize(out_value.as.str.data, out_value.as.str.size) 
+ return base64.b64encode(raw_bytes).decode('ascii') + else: + return dml_value_to_py(out_value) + finally: + if out_value != NULL: + dml_value_free(out_value) + + def delete(self, Ref key) -> None: + self._check() + cdef const char *key_char + cdef const char *ns_char + cdef Py_ssize_t key_size + cdef Py_ssize_t ns_size + cdef int rc + cdef object py_id = None + cdef object py_ns = None + + py_id = key.id() + key_char = PyUnicode_AsUTF8AndSize(py_id, &key_size) + if key_char == NULL: + raise MemoryError("Insufficient memory") + py_ns = key.ns() + ns_char = PyUnicode_AsUTF8AndSize(py_ns, &ns_size) + if ns_char == NULL: + raise MemoryError("Insufficient memory") + # keep python objects alive for duration of native call + _keep_id = py_id + _keep_ns = py_ns + + rc = dml_db_del(&self._env._handle, self._txn, + ns_char, ns_size, + key_char, key_size) + raise_if_error(rc, f"db.del({key.to})") + + def exists(self, Ref key) -> bool: + self._check() + cdef const char *key_char + cdef const char *ns_char + cdef Py_ssize_t key_size + cdef Py_ssize_t ns_size + cdef int rc + cdef int exists = 0 + + py_id = key.id() + key_char = PyUnicode_AsUTF8AndSize(py_id, &key_size) + if key_char == NULL: + raise MemoryError("Insufficient memory") + py_ns = key.ns() + ns_char = PyUnicode_AsUTF8AndSize(py_ns, &ns_size) + if ns_char == NULL: + raise MemoryError("Insufficient memory") + + rc = dml_db_exists(&self._env._handle, self._txn, + ns_char, ns_size, + key_char, key_size, + &exists) + raise_if_error(rc, f"db.exists({key.to})") + return bool(exists) + + def iter(self, str ns, start_token=None): + self._check() + cdef const char *ns_char + cdef const char *start_char = NULL + cdef DmlObjCollection page + cdef DmlValue **values + cdef char *keys_ptr + cdef Py_ssize_t key_len + cdef Py_ssize_t i + cdef int rc + cdef object token = start_token + cdef object next_token_obj + + ns_char = PyUnicode_AsUTF8(ns) + if ns_char == NULL: + raise MemoryError("Insufficient memory") + + while True: + 
def list_orphans(self, list[Ref] start) -> list[Ref]:
    """
    List orphaned references starting from the given roots.

    Parameters
    ----------
    start : list[Ref]
        ``Ref`` objects whose ``to`` strings are passed to the native
        traversal as roots. May be empty, in which case a NULL array and a
        count of ``0`` are passed.

    Returns
    -------
    list[Ref]
        The value written by ``dml_db_list_orphans`` converted with
        ``dml_value_to_py``, or an empty list when the native call leaves
        the output pointer NULL.

    Raises
    ------
    RuntimeError
        From ``self._check()`` when the environment or transaction is closed.
    MemoryError
        If a native allocation fails or a ref string cannot be exposed as
        UTF-8.
    Exception
        Whatever ``raise_if_error`` raises for a non-zero return code.

    Notes
    -----
    The C string array is built inside the ``try`` block so that the
    ``finally`` clause releases every copy on every exit path, including
    Python exceptions raised while reading `start`.
    """
    self._check()
    cdef Py_ssize_t count = len(start)
    cdef Py_ssize_t i
    cdef Py_ssize_t tmp_len = 0
    cdef const char **refs = NULL
    cdef DmlValue *out_refs = NULL
    cdef int rc
    # C-level temporaries used while building the C string array.
    cdef object py_ref
    cdef const char *ref_ptr
    cdef char *c_copy
    try:
        if count > 0:
            # calloc zero-fills, so unpopulated slots are NULL during cleanup.
            refs = <const char **>calloc(count, sizeof(const char *))
            if refs == NULL:
                raise MemoryError("Insufficient memory")
            for i in range(count):
                py_ref = start[i].to
                ref_ptr = PyUnicode_AsUTF8AndSize(py_ref, &tmp_len)
                if ref_ptr == NULL:
                    raise MemoryError("Insufficient memory")
                # Allocate an owned copy for the native call.
                c_copy = <char *>malloc(tmp_len + 1)
                if c_copy == NULL:
                    raise MemoryError("Insufficient memory")
                memcpy(c_copy, ref_ptr, tmp_len)
                c_copy[tmp_len] = '\0'
                refs[i] = c_copy
        rc = dml_db_list_orphans(&self._env._handle, self._txn, refs, count, &out_refs)
        raise_if_error(rc, "dml_db_list_orphans")
        if out_refs == NULL:
            return []
        return dml_value_to_py(out_refs)
    finally:
        if refs != NULL:
            # Free the copies, then the array.
            for i in range(count):
                if refs[i] != NULL:
                    free(<void *>refs[i])
            free(<void *>refs)
        if out_refs != NULL:
            dml_value_free(out_refs)
def get(x, k, d=NONE):
    """Look up ``k`` in a list or dict, with an optional default for dicts.

    For a dict, ``k`` must be a string; ``d`` is returned for a missing key
    when it was supplied, otherwise the usual ``KeyError`` propagates.
    For a list, ``k`` is either an integer index or a two-element list of
    integers ``[start, stop]`` that selects a slice; defaults are rejected.
    Any other container type raises ``TypeError``.
    """
    has_default = d is not NONE

    if isinstance(x, dict):
        if not isinstance(k, str):
            raise TypeError(f"Dict keys must be strings but got {type(k).__name__}")
        return x.get(k, d) if has_default else x[k]

    if not isinstance(x, list):
        raise TypeError(f"Cannot get from object of type {type(x).__name__}, expected list or dict")

    # List access from here on.
    if has_default:
        raise TypeError("Default values not supported for list access")
    if isinstance(k, list):
        if len(k) != 2:
            raise ValueError("Slice key must have exactly 2 elements [start, stop]")
        for bound in k:
            if not isinstance(bound, int):
                raise TypeError("Slice indices must be integers")
        start, stop = k
        return x[start:stop]
    if isinstance(k, int):
        return x[k]
    raise TypeError("List indices must be integers")
def make_runnable(target, kwargs=None, adapter="", sub=None):
    """Build a ``Runnable`` after validating each component.

    A string ``target`` is first converted with ``make_uri``, and a missing
    ``kwargs`` becomes an empty dict. Checks run in the order target,
    kwargs, sub, adapter; the first failing check raises ``TypeError``.
    """
    resolved = make_uri(target) if isinstance(target, str) else target
    if not isinstance(resolved, Uri):
        raise TypeError(f"Runnable target must be Uri, got {type(resolved).__name__}")

    options = {} if kwargs is None else kwargs
    if not isinstance(options, dict):
        raise TypeError(f"Runnable kwargs must be dict, got {type(options).__name__}")

    if not (sub is None or isinstance(sub, Runnable)):
        raise TypeError(f"Runnable sub must be Runnable, got {type(sub).__name__}")

    if not isinstance(adapter, str):
        raise TypeError(f"Runnable adapter must be string, got {type(adapter).__name__}")

    return Runnable(target=resolved, sub=sub, kwargs=options, adapter=adapter)
def parse_dml_project_uri(uri: str, *, require_identifier: bool = False) -> ParsedProjectUri:
    """Parse a ``dml://owner/project`` URI, optionally with a branch or tag.

    Parameters
    ----------
    uri
        URI text. Anything that is not a ``str`` is rejected up front.
    require_identifier
        When true, parsing is always delegated to ``parse_revision_uri``
        with ``require_identifier=True``.

    Returns
    -------
    ParsedProjectUri
        Owner and project, plus the branch/tag reported by
        ``parse_revision_uri`` when the URI was delegated to it.

    Raises
    ------
    ValueError
        If `uri` is not a string, does not start with ``dml://``, carries a
        query or fragment, or does not have exactly one path segment.
        ``parse_revision_uri`` and ``validate_segment`` may raise their own
        errors.
    """
    # Type-check before any membership test: `"#" in uri` on a non-string
    # would raise TypeError (or silently scan a container) instead of
    # reporting an invalid URI.
    if not isinstance(uri, str):
        raise ValueError(f"Invalid DML URI: {uri!r}")

    # URIs that carry (or must carry) a branch/tag go through the revision parser.
    if require_identifier or "#" in uri or "@" in uri:
        parsed = parse_revision_uri(uri, require_identifier=True)
        return ParsedProjectUri(parsed.owner, parsed.project, branch=parsed.branch, tag=parsed.tag)

    if not uri.startswith("dml://"):
        raise ValueError(f"Invalid DML URI: {uri!r}")
    parts = urlsplit(uri)
    if parts.scheme != "dml" or not parts.netloc or parts.query or parts.fragment:
        raise ValueError(f"Invalid DML URI: {uri!r}")
    project = parts.path.strip("/")
    if "/" in project or not project:
        raise ValueError(f"Invalid DML URI project path: {uri!r}")
    return ParsedProjectUri(
        owner=validate_segment("project owner", parts.netloc),
        project=validate_segment("project name", project),
        branch=None,
        tag=None,
    )
+def normalize_project_uri(uri: str, *, default_branch: str | None = None, require_branch: bool = False) -> str: + parsed = parse_revision_uri( + uri, + default_branch=default_branch, + require_identifier=require_branch, + ) + return canonicalize_revision_uri(stringify_revision_uri(parsed), require_identifier=True) + + +def validate_remote_root(value: str) -> str: + if not isinstance(value, str): + raise ValueError("remote.root must be a string") + if not value: + return "" + if not value.startswith("s3://"): + raise ValueError("remote.root must be s3://bucket or s3://bucket/prefix") + rest = value[5:] + if not rest: + raise ValueError("remote.root must include a bucket name") + bucket, sep, prefix = rest.partition("/") + if not bucket: + raise ValueError("remote.root must include a bucket name") + if sep and not prefix.strip("/"): + raise ValueError("remote.root prefix must be non-empty when '/' is provided") + return value.rstrip("/") + + +def global_config_home(env: Mapping[str, str] | None = None) -> Path: + env_map = os.environ if env is None else env + if env_map.get("DML_CONFIG_HOME"): + return Path(os.path.expanduser(env_map["DML_CONFIG_HOME"])) + if env_map.get("XDG_CONFIG_HOME"): + return Path(os.path.expanduser(env_map["XDG_CONFIG_HOME"])) / "dml" + return Path(os.path.expanduser("~/.config/dml")) + + +def default_user(env: Mapping[str, str] | None = None) -> str | None: + env_map = os.environ if env is None else env + user = env_map.get("USER") + if not user: + try: + user = getuser() + except Exception: + user = None + if not user: + return None + try: + host = gethostname().split(".", 1)[0] + except Exception: + host = "" + return f"{user}@{host}" if host else user + + +def _read_toml(path: Path) -> dict[str, Any]: + return tomllib.loads(path.read_text()) + + +def _coerce_path(value: object) -> str | None: + if value is None: + return None + if isinstance(value, Path): + value = str(value) + if not isinstance(value, str): + raise ValueError(f"Expected 
path-like string, got {type(value).__name__}") + return os.path.expanduser(value) + + +def _coerce_positive_int(value: object, *, key: str) -> int | None: + if value is None: + return None + if isinstance(value, bool): + raise ValueError(f"{key} must be a positive integer") + if isinstance(value, int): + parsed = value + elif isinstance(value, str): + text = value.strip() + if not text: + return None + try: + parsed = int(text, 10) + except ValueError as exc: + raise ValueError(f"{key} must be a positive integer") from exc + else: + raise ValueError(f"{key} must be a positive integer") + if parsed <= 0: + raise ValueError(f"{key} must be a positive integer") + return parsed + + +def _load_global_layer(config_home: str, env: Mapping[str, str]) -> dict[str, object]: + path = Path(config_home) / "config.toml" + layer: dict[str, object] = {"config_home": config_home} + if not path.exists(): + return layer + data = _read_toml(path) + defaults = data.get("defaults", {}) or {} + remote = data.get("remote", {}) or {} + user = data.get("user", {}) or {} + layer["user"] = user.get("name") + layer["default_branch"] = defaults.get("branch") + layer["remote.fetch_workers"] = remote.get("fetch_workers") + return layer + + +def _load_project_layer(project_home: str | None) -> dict[str, object]: + if not project_home: + return {} + path = Path(project_home) / ".dml" / "config.toml" + if not path.exists(): + return {} + data = _read_toml(path) + remote = data.get("remote", {}) or {} + layer: dict[str, object] = {"project.home": project_home} + if remote.get("project"): + layer["remote.project"] = str(remote["project"]) + if remote.get("root"): + layer["remote.root"] = remote.get("root") + if remote.get("fetch_workers") is not None: + layer["remote.fetch_workers"] = remote.get("fetch_workers") + return layer + + +def _normalize_key(key: str) -> str: + return key + + +def _normalize_inputs(values: Mapping[str, object] | None) -> dict[str, object]: + out: dict[str, object] = {} + if 
    @classmethod
    def resolve(
        cls,
        *,
        scope: str = _PROJECT_SCOPE,
        explicit: Mapping[str, object] | None = None,
        env: Mapping[str, str] | None = None,
        defaults: Mapping[str, object] | None = None,
    ) -> "DmlConfig":
        """Build a ``DmlConfig`` by merging every configuration source.

        Layers are overlaid in this order, later layers winning. A layer value
        of ``None`` never replaces an earlier value (see ``_overlay``):

        1. the ``defaults`` argument,
        2. the global ``config.toml`` found under the config home,
        3. the project's ``.dml/config.toml`` (project scope only),
        4. environment variables mapped through ``_ENV_KEYS``,
        5. the ``explicit`` argument.

        Args:
            scope: Must be ``_PROJECT_SCOPE`` or ``_GLOBAL_SCOPE``.
            explicit: Highest-priority overrides keyed by setting name.
            env: Environment mapping to read; ``os.environ`` when ``None``.
            defaults: Lowest-priority values keyed by setting name.

        Returns:
            A frozen ``DmlConfig`` with all settings resolved.

        Raises:
            ValueError: For an unknown ``scope`` or when ``remote.project`` /
                ``remote.root`` is not a string. The validation helpers called
                here (branch name, project URI, remote root, fetch workers)
                raise on invalid values as well.
        """
        if scope not in {_PROJECT_SCOPE, _GLOBAL_SCOPE}:
            raise ValueError(f"Unknown config scope: {scope!r}")
        env_map = os.environ if env is None else env
        defaults_layer = _normalize_inputs(defaults)
        explicit_layer = _normalize_inputs(explicit)
        env_layer = _env_layer(env_map)
        # The config home must be known before the global file can be read, so
        # it is resolved up front with the same explicit > env > defaults order.
        raw_config_home = (
            explicit_layer.get("config_home") or env_layer.get("config_home") or defaults_layer.get("config_home")
        )
        config_home = _coerce_path(raw_config_home) or str(global_config_home(env_map))
        base: dict[str, object] = {"config_home": config_home}
        merged = _overlay(base, defaults_layer)
        merged = _overlay(merged, _load_global_layer(config_home, env_map))
        # Likewise the project home is needed before the project file can be
        # located; it is looked up ahead of the final env/explicit overlay.
        project_home_input = (
            explicit_layer.get("project.home") or env_layer.get("project.home") or merged.get("project.home")
        )
        project_home = _coerce_path(project_home_input)
        if project_home is None and scope == _PROJECT_SCOPE:
            # Project scope without a configured home falls back to the CWD.
            project_home = str(Path.cwd())
        if scope == _PROJECT_SCOPE:
            merged = _overlay(merged, _load_project_layer(project_home))
        merged = _overlay(merged, env_layer)
        merged = _overlay(merged, explicit_layer)
        # Re-read the project home from the fully merged view, applying the
        # same CWD fallback when nothing in any layer supplied one.
        project_home = _coerce_path(merged.get("project.home"))
        if project_home is None and scope == _PROJECT_SCOPE:
            project_home = str(Path.cwd())
        default_branch_value = merged.get("default_branch")
        # Missing or falsy branch values resolve to "main".
        default_branch = str(default_branch_value) if default_branch_value else "main"
        _validate_ref_name("branch", default_branch)
        remote_project: str | None = None
        raw_remote_project = merged.get("remote.project")
        if raw_remote_project is not None:
            if not isinstance(raw_remote_project, str):
                raise ValueError("remote.project must be a string")
            remote_project = validate_dml_project_uri(raw_remote_project)
        db_path = _coerce_path(merged.get("db.path"))
        if db_path is None and project_home and scope == _PROJECT_SCOPE:
            # Default database location lives inside the project's .dml dir.
            db_path = str(Path(project_home) / ".dml" / "db")
        remote_root = merged.get("remote.root")
        if remote_root is None:
            remote_root_s = ""
        else:
            if not isinstance(remote_root, str):
                raise ValueError("remote.root must be a string")
            remote_root_s = validate_remote_root(remote_root)
        remote_fetch_workers = _coerce_positive_int(merged.get("remote.fetch_workers"), key="remote.fetch_workers")
        if remote_fetch_workers is None:
            # Same value as the DmlRemoteSettings.fetch_workers default.
            remote_fetch_workers = 16
        user_value = merged.get("user")
        # With no configured user, derive one from the environment / host.
        user = str(user_value) if user_value else default_user(env_map)
        return cls(
            project=DmlProjectSettings(home=project_home),
            db=DmlDbSettings(path=db_path),
            remote=DmlRemoteSettings(project=remote_project, root=remote_root_s, fetch_workers=remote_fetch_workers),
            user=user,
            default_branch=default_branch,
            config_home=config_home,
        )
default_branch=resolved.default_branch, + ) + + +@dataclass(frozen=True) +class DmlProjectConfig: + name: str | None = None + owner: str | None = None + remote_root: str = "" + + @property + def uri(self) -> str | None: + if self.owner is None or self.name is None: + return None + return validate_dml_project_uri(f"dml://{self.owner}/{self.name}") + + @property + def remote_project(self) -> str | None: + return self.uri + + def __post_init__(self) -> None: + if (self.name is None) != (self.owner is None): + raise ValueError("Project config requires both name and owner when remote.project is configured") + if self.name is not None: + _validate_name("project name", self.name) + if self.owner is not None: + _validate_name("project owner", self.owner) + if self.remote_root: + validate_remote_root(self.remote_root) + + @classmethod + def load(cls, project_dir: Path | str = ".") -> "DmlProjectConfig": + resolved = DmlConfig.resolve(explicit={"project.home": str(project_dir)}, env={}) + if not resolved.remote.project: + return cls(remote_root=resolved.remote.root) + parsed = parse_dml_project_uri(resolved.remote.project, require_identifier=False) + return cls(name=parsed.project, owner=parsed.owner, remote_root=resolved.remote.root) + + def save(self, project_dir: Path | str = ".") -> None: + dml_dir = Path(project_dir) / ".dml" + dml_dir.mkdir(parents=True, exist_ok=True) + lines = ["[remote]"] + if self.remote_project: + lines.append(f'project = "{self.remote_project}"') + if self.remote_root: + lines.append(f'root = "{validate_remote_root(self.remote_root)}"') + (dml_dir / "config.toml").write_text("\n".join(lines) + "\n") + + +def init_project_layout(project_dir: Path | str, cfg: DmlProjectConfig) -> Path: + root = Path(project_dir) + dml_dir = root / ".dml" + db_dir = dml_dir / "db" + db_dir.mkdir(parents=True, exist_ok=True) + (dml_dir / ".gitignore").write_text("db\nHEAD\nrefs\n") + cfg.save(root) + return db_dir diff --git a/src/daggerml/_internal/dml.py 
b/src/daggerml/_internal/dml.py new file mode 100644 index 0000000..0ba43fe --- /dev/null +++ b/src/daggerml/_internal/dml.py @@ -0,0 +1,1562 @@ +from __future__ import annotations + +import logging +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import contextmanager +from dataclasses import dataclass +from pathlib import Path +from typing import Annotated, Any, Literal, NotRequired, TypedDict, cast, overload + +from daggerml._internal._db import DmlDbEnv, Ref +from daggerml._internal.config import DmlProjectConfig, parse_dml_project_uri +from daggerml._internal.dml_context import ( + config_dict, + current_head_branch, + current_head_state, + db_path_for_project, + gitignore_exists, + load_project_config, + mutable_branch, + project_config_exists, + project_remote_root, + require_project_home, + require_user, + resolve_runtime_context, +) +from daggerml._internal.dml_resolution import ( + resolve_dag_ref, + resolve_node_ref, +) +from daggerml._internal.dml_resolution import ( + resolve_revision as resolve_revision_value, +) +from daggerml._internal.dml_resolution import ( + resolve_revision_ref as resolve_revision_ref_value, +) +from daggerml._internal.ops.cache import CacheOps +from daggerml._internal.ops.commit import CommitOps +from daggerml._internal.ops.config import ConfigOps +from daggerml._internal.ops.dag import DagOps +from daggerml._internal.ops.gc import GcOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.index import IndexOps +from daggerml._internal.ops.node import NodeOps +from daggerml._internal.ops.remote import RemoteOps +from daggerml._internal.types import DEFAULT_HEAD, NAMESPACES, DmlRepoError, Error, Runnable, Uri + +logger = logging.getLogger(__name__) + +_RUNTIME_CANCEL_MAX_ATTEMPTS = 3 +_RUNTIME_CANCEL_BACKOFF_SECONDS = 0.05 +_DB_MAP_SIZE = 1024**3 + + +class ProjectConfigPayload(TypedDict): + home: str | None + uri: str | None + + +class 
class RevisionPayload(TypedDict):
    """Serializable description of a resolved revision selector.

    Assembled by ``revision_payload`` from the caller's selector text and the
    object produced by revision resolution.
    """

    # Selector text exactly as the caller supplied it (e.g. "HEAD").
    input: str
    # Copied from the resolved object's ``kind``; the set of possible values is
    # defined by the resolver and is not visible in this module.
    kind: str
    # Commit the selector resolved to.
    commit: Ref
    # Copied from the resolved object's ``branch`` / ``tag``; either may be None.
    branch: str | None
    tag: str | None
class RuntimeCancelPayload(TypedDict):
    """Statistics returned by ``_RuntimeNamespace.cancel``."""

    # Runtime workspace id that was passed to cancel().
    index_id: str
    # Number of passes made through the cancellation retry loop.
    iterations: int
    # Size of the "graph" collection in the cancellation plan.
    graph_edges: int
    # Number of candidate executions in the plan before the first pass.
    candidate_count: int
    # Number of executions still recorded as owned once the loop has finished.
    own_execution_count: int
    # Candidates whose cancellation attempt reported outcome 1.
    cancelled_count: int
    # Candidates whose cancellation attempt reported outcome -1.
    dropped_count: int
    # Attempts whose result had the "lock_retry" flag set.
    lock_retry_count: int
def require_exact_ref(value: Ref, expected_root_ns: str) -> Ref:
    """Return ``value`` unchanged if it is a ``Ref`` in the expected namespace.

    Args:
        value: Candidate reference supplied by the caller.
        expected_root_ns: Required first namespace segment, e.g. ``"dag"``.

    Raises:
        DmlRepoError: If ``value`` is not a ``Ref`` or its first namespace
            segment differs from ``expected_root_ns``.
    """
    if isinstance(value, Ref):
        root_ns = value.nss()[0]
        if root_ns == expected_root_ns:
            return value
        raise DmlRepoError(f"Expected {expected_root_ns} ref, got: {value}")
    raise DmlRepoError(f"Expected {expected_root_ns} Ref, got: {type(value).__name__}")
def split_remote_root(remote_root: str) -> tuple[str, str]:
    """Split an ``s3://bucket[/prefix]`` root into ``(bucket, key_prefix)``.

    The key prefix always ends in a ``dml`` segment: ``"dml"`` when the URI
    has no prefix, otherwise ``"<prefix>/dml"`` with leading and trailing
    slashes stripped from the prefix.

    Args:
        remote_root: Remote root URI to split.

    Returns:
        ``(bucket, key_prefix)``.

    Raises:
        ValueError: If the URI does not start with ``s3://`` or names no
            bucket (``"s3://"`` and ``"s3:///prefix"`` are both rejected).
    """
    scheme = "s3://"
    if not remote_root.startswith(scheme):
        raise ValueError(f"Invalid remote root URI: {remote_root!r}")
    bucket, _, prefix = remote_root[len(scheme):].partition("/")
    if not bucket:
        # An empty bucket would otherwise only surface later as an opaque S3
        # client error; reject it here like validate_remote_root does.
        raise ValueError(f"Invalid remote root URI: {remote_root!r}")
    prefix = prefix.strip("/")
    return bucket, f"{prefix}/dml" if prefix else "dml"
def dag_map_diff(left: dict[str, Ref], right: dict[str, Ref]) -> DagMapDiffPayload:
    """Compare two name-to-DAG maps.

    Names are visited in sorted order. A name only in ``right`` is "added",
    a name only in ``left`` is "removed", and a name in both with differing
    refs is "updated" (recorded as ``{"before": ..., "after": ...}``).
    """
    added: dict[str, Ref] = {}
    removed: dict[str, Ref] = {}
    updated: dict[str, dict[str, Ref]] = {}
    for dag_name in sorted(left.keys() | right.keys()):
        old_ref, new_ref = left.get(dag_name), right.get(dag_name)
        if old_ref is None:
            if new_ref is not None:
                added[dag_name] = new_ref
            continue
        if new_ref is None:
            removed[dag_name] = old_ref
        elif old_ref != new_ref:
            updated[dag_name] = {"before": old_ref, "after": new_ref}
    return {"added": added, "removed": removed, "updated": updated}
def dag_payload(dml: "Dml", dag_ref: Ref) -> DagPayload:
    """Return the DAG summary for ``dag_ref`` with each node ref expanded.

    The summary is fetched first; then every node ref it lists is replaced by
    that node's description, read through a single database handle.
    """
    overview = dag_summary_payload(dml, dag_ref)
    pending_refs = list(overview["nodes"])
    described: list[NodeDescriptionPayload] = []
    with with_db(dml) as db:
        node_reader = make_node_ops(db)
        for pending_ref in pending_refs:
            described.append(node_description_payload(node_reader.describe(pending_ref)))
    names = overview["names"]
    result = overview["result"]
    argv = overview["argv"]
    kwargv = overview["kwargv"]
    ref = overview["ref"]
    return {
        "nodes": described,
        "names": names,
        "result": result,
        "argv": argv,
        "kwargv": kwargv,
        "ref": ref,
    }
"Replacement value to write for the setting."], + scope: Annotated[Literal["global", "local"], "Config scope to update."] = "local", + ): + """Persist one configuration setting in the selected config file.""" + return config_ops(self._dml).set(key, value, scope=scope) + + @overload + def show(self, *, contrib: Literal[False] = False) -> ConfigShowPayload: ... + @overload + def show(self, *, contrib: Literal[True]) -> ConfigShowContribPayload: ... + def show( + self, + *, + contrib: Annotated[bool, "Include contrib runtime status alongside core config data."] = False, + ) -> ConfigShowPayload: + """Return the active configuration payload, optionally with contrib status.""" + payload = cast(ConfigShowPayload, config_dict(self._dml._context.config)) + if contrib: + from daggerml.contrib import status as contrib_status + + return cast(ConfigShowContribPayload, {**payload, "contrib": contrib_status.status()}) + return payload + + +@dataclass(frozen=True) +class _RuntimeNamespace: + """Manage mutable runtime indexes and staged DAG execution state.""" + + _dml: "Dml" + + ######## Dag runtime operations ######## + def create( + self, + *, + head: Annotated[str | None, "Branch name to base the index on."] = None, + commit: Annotated[Ref | None, "Commit to base the index on when not using a branch."] = None, + argv_ptr: Annotated[ + str | None, + "Remote argv pointer to import when resuming external execution state.", + ] = None, + index_id: Annotated[str | None, "Explicit runtime identifier to reuse or create."] = None, + ) -> str: + """Create a runtime workspace from HEAD, a branch, a commit, or an argv pointer.""" + with with_db(self._dml) as db: + if head is None and commit is None and argv_ptr is None: + head_state = make_head_ops(db).get_head_state() + head = head_state.branch + commit = head_state.commit if head is None else None + return make_index_ops(db, self._dml).create(head=head, commit=commit, argv_ptr=argv_ptr, index_id=index_id) + + def get_node( + self, + 
index_id: Annotated[str, "Runtime workspace to read from."], + name: Annotated[str, "Named node to resolve inside the runtime workspace."], + ) -> Ref: + """Return the stored identifier for a named node in a runtime workspace.""" + with with_db(self._dml) as db: + return make_index_ops(db, self._dml).get_node(index_id, name) + + def get_argv(self, index_id: Annotated[str, "Runtime workspace to read from."]) -> Ref: + """Return the argv node for a runtime workspace.""" + with with_db(self._dml) as db: + return make_index_ops(db, self._dml).get_argv(index_id) + + def put_literal( + self, + index_id: Annotated[str, "Runtime workspace to mutate."], + value: Annotated[Any, "Literal value to stage into the workspace."], + *, + name: Annotated[str | None, "Optional name to assign to the staged value."] = None, + ) -> Ref: + """Stage a literal value in the runtime workspace and optionally name it.""" + with with_db(self._dml) as db: + return make_index_ops(db, self._dml).put_literal(index_id, value, name=name) + + def put_import( + self, + index_id: Annotated[str, "Runtime workspace to mutate."], + dag: Annotated[Ref, "Committed DAG to import from."], + *, + node: Annotated[Ref | None, "Specific node to import from that DAG."] = None, + name: Annotated[str | None, "Optional name for the imported node."] = None, + ) -> Ref: + """Import a committed DAG node into the runtime workspace.""" + with with_db(self._dml) as db: + return make_index_ops(db, self._dml).put_import(index_id, dag, node=node, name=name) + + def set_node_name( + self, + index_id: Annotated[str, "Runtime workspace to mutate."], + name: Annotated[str, "Name to bind inside the runtime index."], + node: Annotated[Ref, "Existing node to bind to that name."], + ) -> Ref: + """Bind an existing node to a name in the runtime workspace.""" + with with_db(self._dml) as db: + return make_index_ops(db, self._dml).set_node_name(index_id, name, node) + + def start_fn( + self, + index_id: Annotated[str, "Runtime workspace 
    def cancel(
        self,
        index_id: Annotated[str, "Runtime workspace whose active executions should be cancelled."],
    ) -> RuntimeCancelPayload:
        """Cancel tracked executions for a runtime workspace and report cancellation statistics."""
        # Overview: obtain a cancellation plan from IndexOps, then repeatedly
        # try to cancel every remaining candidate in parallel until the
        # candidate set is empty, sleeping between passes when a retry is due.
        # Raises DmlRepoError when one candidate's attempt has raised
        # _RUNTIME_CANCEL_MAX_ATTEMPTS times.
        requested_by = require_user(self._dml._context.user, message="user is required for runtime cancel")
        with with_db(self._dml) as db:
            index = make_index_ops(db, self._dml)
            plan = index.cancel(index_id, requested_by=requested_by)
            # Work on copies so the plan's own sets are not mutated below.
            candidate_set = set(cast(set[str], plan["candidate_set"]))
            own_executions = set(cast(set[str], plan["own_executions"]))
            # Per-candidate count of attempts that ended in an exception.
            retry_counts = {candidate_id: 0 for candidate_id in candidate_set}
            # Candidates whose attempt raised during the current pass.
            adapter_retry_candidates: set[str] = set()
            stats: RuntimeCancelPayload = {
                "index_id": index_id,
                "iterations": 0,
                "graph_edges": len(cast(set[tuple[str, str]], plan["graph"])),
                "candidate_count": len(candidate_set),
                "own_execution_count": 0,
                "cancelled_count": 0,
                "dropped_count": 0,
                "lock_retry_count": 0,
            }
            while candidate_set:
                stats["iterations"] += 1
                # Sorted so submission order is deterministic for a given set.
                batch = sorted(candidate_set)
                normal_retry_pending = False
                logger.info(
                    "runtime.cancel iteration=%s index_id=%s candidates=%s owned=%s",
                    stats["iterations"],
                    index_id,
                    len(batch),
                    len(own_executions),
                )
                with ThreadPoolExecutor() as executor:
                    # Each worker receives its own snapshot of own_executions,
                    # so mutations made while collecting results are not
                    # shared with tasks that are still running.
                    futures = {
                        executor.submit(
                            index._cancel_execution_candidate,
                            candidate_id,
                            requested_by=requested_by,
                            own_executions=set(own_executions),
                        ): candidate_id
                        for candidate_id in batch
                    }
                    for future in as_completed(futures):
                        candidate_id = futures[future]
                        try:
                            result = future.result()
                        except Exception as exc:
                            # A raised attempt counts against the retry limit;
                            # the candidate stays in candidate_set for the
                            # next pass unless the limit has been reached.
                            retry_counts[candidate_id] += 1
                            if retry_counts[candidate_id] >= _RUNTIME_CANCEL_MAX_ATTEMPTS:
                                raise DmlRepoError(
                                    f"runtime.cancel exceeded retry limit for execution {candidate_id}: {exc}"
                                ) from exc
                            adapter_retry_candidates.add(candidate_id)
                            continue
                        if cast(bool, result["lock_retry"]):
                            stats["lock_retry_count"] += 1
                            normal_retry_pending = True
                        outcome = cast(int | None, result["outcome"])
                        if outcome == 1:
                            # Candidate finished: counted as cancelled.
                            candidate_set.discard(candidate_id)
                            stats["cancelled_count"] += 1
                        elif outcome == -1:
                            # Candidate finished: counted as dropped, and it
                            # is no longer treated as one of our executions.
                            candidate_set.discard(candidate_id)
                            if candidate_id in own_executions:
                                own_executions.discard(candidate_id)
                            stats["dropped_count"] += 1
                        elif outcome is None:
                            # No outcome yet; try this candidate again.
                            normal_retry_pending = True
                        # NOTE(review): any other outcome value leaves the
                        # candidate in the set without scheduling a delay —
                        # confirm _cancel_execution_candidate only ever
                        # reports 1, -1 or None.
                if adapter_retry_candidates or normal_retry_pending:
                    delay = _RUNTIME_CANCEL_BACKOFF_SECONDS
                    if adapter_retry_candidates:
                        # Exponential backoff keyed on the highest failure
                        # count among candidates that raised in this pass.
                        delay *= 2 ** (max(retry_counts[candidate_id] for candidate_id in adapter_retry_candidates) - 1)
                    time.sleep(delay)
                # NOTE(review): only raised attempts are bounded by
                # _RUNTIME_CANCEL_MAX_ATTEMPTS; lock retries and None outcomes
                # repeat with the base delay for as long as they keep
                # occurring — confirm that is intended.
                adapter_retry_candidates.clear()
            index._complete_index_cancellation(
                index_id,
                cancelled_path=cast(Path, plan["cancelled_path"]),
                own_executions=own_executions,
            )
            # Reported only after the loop, once dropped ids have been removed.
            stats["own_execution_count"] = len(own_executions)
            logger.info(
                "runtime.cancel complete index_id=%s iterations=%s cancelled=%s dropped=%s lock_retries=%s",
                index_id,
                stats["iterations"],
                stats["cancelled_count"],
                stats["dropped_count"],
                stats["lock_retry_count"],
            )
            return stats
+ def describe( + self, + value: Annotated[str | Ref, "DAG by name or exact Ref."], + *, + revision: Annotated[str | None, "Optional revision when the DAG value is name-based."] = None, + ) -> DagDescribePayload: + """Return DAG metadata without materializing full node values.""" + with with_db(self._dml) as db: + if isinstance(value, Ref): + if revision is not None: + raise DmlRepoError("dml dag describe rejects --revision with explicit dag refs") + resolved = None + dag_ref = require_exact_ref(value, "dag") + else: + resolved = resolve_dag_ref( + value=value, + revision=revision, + commit_ops=make_commit_ops(db), + head_ops=make_head_ops(db), + project_dir=require_project_home(self._dml._context.project_home), + operation="describe", + ) + dag_ref = resolved.ref + payload: DagDescribePayload = {"dag": dag_summary_payload(self._dml, dag_ref)} + if resolved is not None and resolved.revision is not None: + return cast( + DagDescribeWithRevisionPayload, + {**payload, "revision": revision_payload(revision or "HEAD", resolved.revision)}, + ) + return payload + + @overload + def get(self, value: str | Ref, *, revision: None = None) -> DagGetPayload: ... + @overload + def get(self, value: str | Ref, *, revision: str) -> DagGetWithRevisionPayload: ... 
+ def get( + self, + value: Annotated[str | Ref, "DAG by name or exact Ref."], + *, + revision: Annotated[str | None, "Optional revision when the DAG value is name-based."] = None, + ) -> DagGetPayload: + """Return a DAG payload including described nodes from the selected revision.""" + with with_db(self._dml) as db: + if isinstance(value, Ref): + if revision is not None: + raise DmlRepoError("dml dag get rejects --revision with explicit dag refs") + resolved = None + dag_ref = require_exact_ref(value, "dag") + else: + resolved = resolve_dag_ref( + value=value, + revision=revision, + commit_ops=make_commit_ops(db), + head_ops=make_head_ops(db), + project_dir=require_project_home(self._dml._context.project_home), + operation="get", + ) + dag_ref = resolved.ref + payload: DagGetPayload = {"dag": dag_payload(self._dml, dag_ref)} + if resolved is not None and resolved.revision is not None: + return cast( + DagGetWithRevisionPayload, + {**payload, "revision": revision_payload(revision or "HEAD", resolved.revision)}, + ) + return payload + + @overload + def describe_node( + self, + node: str | Ref, + *, + dag: str | Ref | None = None, + revision: None = None, + ) -> NodeDescribePayload: ... + @overload + def describe_node( + self, + node: str | Ref, + *, + dag: str | Ref | None = None, + revision: str, + ) -> NodeDescribeWithRevisionPayload: ... 
+ def describe_node( + self, + node: Annotated[ + str | Ref, + "Node by name or exact Ref; examples: result, answer, Ref('node-literal:1').", + ], + *, + dag: Annotated[ + str | Ref | None, + "Optional DAG by name or exact Ref when node is name-based; examples: train, Ref('dag:1').", + ] = None, + revision: Annotated[str | None, "Optional revision selector such as HEAD or main."] = None, + ) -> NodeDescribePayload: + """Describe a committed node without loading its full value.""" + with with_db(self._dml) as db: + if isinstance(node, Ref): + if dag is not None or revision is not None: + raise DmlRepoError("dml dag describe-node rejects dag and revision with explicit node refs") + resolved = None + node_ref = require_exact_ref(node, "node") + elif isinstance(dag, Ref): + if revision is not None: + raise DmlRepoError("dml dag describe-node rejects --revision with explicit dag refs") + resolved = None + node_ref = make_dag_ops(db).get_node(require_exact_ref(dag, "dag"), node) + else: + resolved = resolve_node_ref( + value=node, + dag=dag, + revision=revision, + commit_ops=make_commit_ops(db), + dag_ops=make_dag_ops(db), + head_ops=make_head_ops(db), + project_dir=require_project_home(self._dml._context.project_home), + operation="describe-node", + ) + node_ref = resolved.ref + described_node = node_description_payload(make_node_ops(db).describe(node_ref)) + payload: NodeDescribePayload = {"node": described_node} + if resolved is not None and resolved.revision is not None: + return cast( + NodeDescribeWithRevisionPayload, + {**payload, "revision": revision_payload(revision or "HEAD", resolved.revision)}, + ) + return payload + + @overload + def get_node( + self, + node: str | Ref, + *, + dag: str | Ref | None = None, + revision: None = None, + ) -> NodeGetPayload: ... + @overload + def get_node( + self, + node: str | Ref, + *, + dag: str | Ref | None = None, + revision: str, + ) -> NodeGetWithRevisionPayload: ... 
+ def get_node( + self, + node: Annotated[ + str | Ref, + "Node by name or exact Ref; examples: result, answer, Ref('node-literal:1').", + ], + *, + dag: Annotated[ + str | Ref | None, + "Optional DAG by name or exact Ref when node is name-based; examples: train, Ref('dag:1').", + ] = None, + revision: Annotated[str | None, "Optional revision selector such as HEAD or main."] = None, + ) -> NodeGetPayload: + """Return the value for a committed node.""" + with with_db(self._dml) as db: + if isinstance(node, Ref): + if dag is not None or revision is not None: + raise DmlRepoError("dml dag get-node rejects dag and revision with explicit node refs") + resolved = None + node_ref = require_exact_ref(node, "node") + elif isinstance(dag, Ref): + if revision is not None: + raise DmlRepoError("dml dag get-node rejects --revision with explicit dag refs") + resolved = None + node_ref = make_dag_ops(db).get_node(require_exact_ref(dag, "dag"), node) + else: + resolved = resolve_node_ref( + value=node, + dag=dag, + revision=revision, + commit_ops=make_commit_ops(db), + dag_ops=make_dag_ops(db), + head_ops=make_head_ops(db), + project_dir=require_project_home(self._dml._context.project_home), + operation="get-node", + ) + node_ref = resolved.ref + node_value = make_node_ops(db).get(node_ref) + payload: NodeGetPayload = {"node": cast(NodeValue, node_value)} + if resolved is not None and resolved.revision is not None: + return cast( + NodeGetWithRevisionPayload, + {**payload, "revision": revision_payload(revision or "HEAD", resolved.revision)}, + ) + return payload + + @overload + def unroll_node( + self, + node: str | Ref, + *, + dag: str | Ref | None = None, + revision: None = None, + ) -> NodeUnrollPayload: ... + @overload + def unroll_node( + self, + node: str | Ref, + *, + dag: str | Ref | None = None, + revision: str, + ) -> NodeUnrollWithRevisionPayload: ... 
+ def unroll_node( + self, + node: Annotated[ + str | Ref, + "Node by name or exact Ref; examples: result, answer, Ref('node-literal:1').", + ], + *, + dag: Annotated[ + str | Ref | None, + "Optional DAG by name or exact Ref when node is name-based; examples: train, Ref('dag:1').", + ] = None, + revision: Annotated[str | None, "Optional revision selector such as HEAD or main."] = None, + ) -> NodeUnrollPayload: + """Return the recursively unrolled value for a committed node.""" + with with_db(self._dml) as db: + if isinstance(node, Ref): + if dag is not None or revision is not None: + raise DmlRepoError("dml dag unroll-node rejects dag and revision with explicit node refs") + resolved = None + node_ref = require_exact_ref(node, "node") + elif isinstance(dag, Ref): + if revision is not None: + raise DmlRepoError("dml dag unroll-node rejects --revision with explicit dag refs") + resolved = None + node_ref = make_dag_ops(db).get_node(require_exact_ref(dag, "dag"), node) + else: + resolved = resolve_node_ref( + value=node, + dag=dag, + revision=revision, + commit_ops=make_commit_ops(db), + dag_ops=make_dag_ops(db), + head_ops=make_head_ops(db), + project_dir=require_project_home(self._dml._context.project_home), + operation="unroll-node", + ) + node_ref = resolved.ref + unrolled_node = make_node_ops(db).unroll(node_ref) + payload: NodeUnrollPayload = {"node": cast(NodeUnrolledValue, unrolled_node)} + if resolved is not None and resolved.revision is not None: + return cast( + NodeUnrollWithRevisionPayload, + {**payload, "revision": revision_payload(revision or "HEAD", resolved.revision)}, + ) + return payload + + def checkout( + self, + revision: Annotated[str, "Revision selector to copy from; examples: HEAD, main, origin/main."], + dag_name: Annotated[str, "Name of the DAG to copy from the source revision."], + *, + branch: Annotated[str | None, "Target branch to mutate; defaults to the active attached branch."] = None, + target_name: Annotated[str | None, "Optional 
new name for the checked-out DAG."] = None, + replace: Annotated[bool, "Replace an existing DAG with the same target name if present."] = False, + user: Annotated[str | None, "User recorded as the DAG checkout author."] = None, + ) -> Ref: + """Copy a DAG from a revision into a mutable branch.""" + author = require_user(user or self._dml._context.user, message="user is required for dag checkout") + resolved_revision = resolve_dml_revision_ref(self._dml, revision) + with with_db(self._dml) as db: + target_branch = mutable_branch(branch=branch, head_ops=make_head_ops(db)) + return make_commit_ops(db).checkout_dag( + target_branch, + resolved_revision, + dag_name, + target_name=target_name, + replace=replace, + user=author, + ) + + def delete( + self, + name: Annotated[str, "Name of the DAG to remove from the branch."], + *, + branch: Annotated[str | None, "Target branch to mutate; defaults to the active attached branch."] = None, + user: Annotated[str | None, "User recorded as the delete author."] = None, + ): + """Delete a named DAG from a mutable branch.""" + author = require_user(user or self._dml._context.user, message="user is required for dag delete") + with with_db(self._dml) as db: + return make_commit_ops(db).delete_dag(name, branch, author) + + +@dataclass(frozen=True) +class _AdminIndexNamespace: + """Inspect and manage runtime workspaces directly for administrative workflows.""" + + _dml: "Dml" + + def list(self) -> AdminIndexListPayload: + """List runtime workspaces with their commit summaries.""" + with with_db(self._dml) as db: + indexes = [self.get(index_id)["index"] for index_id in make_head_ops(db).list_indexes()] + return {"indexes": indexes} + + def get(self, index_id: Annotated[str, "Runtime workspace to inspect."]) -> AdminIndexGetPayload: + """Return a runtime workspace payload with embedded commit summary data.""" + with with_db(self._dml) as db: + index = dict(make_index_ops(db, self._dml).describe(index_id)) + commit_ref = index["commit"] + 
index["commit"] = { + "ref": commit_ref, + "summary": commit_payload(commit_ref, make_commit_ops(db).describe(commit_ref)), + } + return {"index": cast(AdminIndexItemPayload, index)} + + def delete(self, index_id: Annotated[str, "Runtime workspace to delete."]) -> AdminIndexDeletePayload: + """Delete a runtime workspace immediately.""" + with with_db(self._dml) as db: + make_index_ops(db, self._dml).delete(index_id) + return {"index": index_id, "deleted": True} + + +@dataclass(frozen=True) +class _AdminCacheNamespace: + """Perform administrative operations against remote-backed cache state.""" + + _dml: "Dml" + + def invalidate( + self, + cache_keys: Annotated[list[str], "Exact cache entries to invalidate; wildcards and prefixes are not accepted."], + ) -> AdminCacheInvalidatePayload: + """Invalidate exact remote cache keys and return the backend response.""" + if not cache_keys: + raise DmlRepoError("At least one cache key is required") + for cache_key in cache_keys: + if ":" in cache_key: + raise DmlRepoError("Admin cache invalidation accepts exact cache keys only") + requested_by = self._dml._context.user or "cli" + with with_db(self._dml) as db: + invalidated = make_remote_ops(db, self._dml).invalidate_cache(cache_keys, requested_by=requested_by) + return {"cache_keys": cache_keys, "invalidated": invalidated} + + +@dataclass(frozen=True) +class _AdminRemoteNamespace: + """Inspect and clean remote project metadata stored under the configured remote root.""" + + _dml: "Dml" + + @overload + def list(self, project: None = None, *, owner: str | None = None) -> AdminRemoteProjectsPayload: ... + @overload + def list(self, project: str, *, owner: None = None) -> AdminRemoteProjectRefsPayload: ... 
+ def list( + self, + project: Annotated[str | None, "Bare project URI such as dml://alice/demo."] = None, + *, + owner: Annotated[str | None, "Filter project listing to one owner when project is omitted."] = None, + ) -> AdminRemoteProjectsPayload | AdminRemoteProjectRefsPayload: + """List remote projects or the branch and tag refs for one remote project.""" + with with_db(self._dml) as db: + remote = make_remote_ops(db, self._dml) + refs = remote.list("projects") + if project is None: + projects: set[str] = set() + for ref in refs: + ref_path = ref.get("ref_path") + if not isinstance(ref_path, str): + continue + parts = ref_path.split("/") + if len(parts) < 5 or parts[0] != "projects": + continue + candidate_owner = parts[1] + candidate_project = parts[2] + if owner is not None and candidate_owner != owner: + continue + projects.add(f"dml://{candidate_owner}/{candidate_project}") + return {"projects": sorted(projects)} + + if not project.startswith("dml://") or "#" in project or "@" in project: + raise DmlRepoError("Admin remote list expects a bare dml:/// project URI") + with with_db(self._dml) as db: + parsed = make_remote_ops(db, self._dml).parse_dml_uri(project, require_identifier=False) + branches: list[str] = [] + tags: list[str] = [] + for ref in refs: + ref_path = ref.get("ref_path") + if not isinstance(ref_path, str): + continue + parts = ref_path.split("/") + if len(parts) < 5 or parts[0] != "projects" or parts[1] != parsed.owner or parts[2] != parsed.project: + continue + name = "/".join(parts[4:]) + if not name.endswith(".json"): + continue + name = name[:-5] + if parts[3] == "heads": + branches.append(f"dml://{parsed.owner}/{parsed.project}#{name}") + elif parts[3] == "tags": + tags.append(f"dml://{parsed.owner}/{parsed.project}@{name}") + return {"project": project, "branches": sorted(branches), "tags": sorted(tags)} + + def gc( + self, + *, + min_age_seconds: Annotated[int, "Minimum object age in seconds before remote GC may delete it."] = 24 * 
3600, + malformed: Annotated[MalformedPolicy, "How to handle malformed remote metadata during GC."] = "warn", + ) -> AdminRemoteGcPayload: + """Delete old remote objects that are no longer live under the configured remote root.""" + with with_db(self._dml) as db: + return cast( + AdminRemoteGcPayload, + make_remote_ops(db, self._dml).gc(min_age_seconds=min_age_seconds, malformed=malformed), + ) + + +@dataclass(frozen=True) +class _AdminNamespace: + """Administrative maintenance surface for indexes, cache state, remotes, and GC.""" + + _dml: "Dml" + + @property + def index(self) -> _AdminIndexNamespace: + return _AdminIndexNamespace(self._dml) + + @property + def cache(self) -> _AdminCacheNamespace: + return _AdminCacheNamespace(self._dml) + + @property + def remote(self) -> _AdminRemoteNamespace: + return _AdminRemoteNamespace(self._dml) + + @overload + def gc(self, *, dry_run: Literal[False] = False) -> AdminGcRunPayload: ... + @overload + def gc(self, *, dry_run: Literal[True]) -> AdminGcDryRunPayload: ... 
+ def gc( + self, + *, + dry_run: Annotated[bool, "Report orphaned refs without deleting them."] = False, + ) -> AdminGcDryRunPayload | AdminGcRunPayload: + """Run local repository garbage collection or report what would be deleted.""" + with with_db(self._dml) as db: + gc_ops = make_gc_ops(db) + if dry_run: + orphans = gc_ops.list_orphans() + return {"dry_run": True, "would_delete": len(orphans), "orphans": orphans} + return {"dry_run": False, "deleted": sum(gc_ops.gc().values())} + + +class Dml: + """Shared orchestration boundary for repository, runtime, DAG, and admin workflows.""" + + def __init__( + self, + project_home: Annotated[str | None, "Project directory containing the .dml state."] = None, + *, + remote_root: Annotated[str | None, "Remote root URI such as s3://bucket/prefix."] = None, + user: Annotated[str | None, "User identity recorded for mutating operations."] = None, + config_home: Annotated[str | None, "Override directory for global DaggerML config files."] = None, + ): + """Resolve runtime context for a project-scoped DaggerML session.""" + self._context = resolve_runtime_context( + project_home=project_home, + remote_root=remote_root, + user=user, + config_home=config_home, + ) + self._s3_client = create_s3_client() if self._context.remote_root else None + + @property + def config(self) -> _ConfigNamespace: + return _ConfigNamespace(self) + + @property + def runtime(self) -> _RuntimeNamespace: + return _RuntimeNamespace(self) + + @property + def dag(self) -> _DagNamespace: + return _DagNamespace(self) + + @property + def admin(self) -> _AdminNamespace: + return _AdminNamespace(self) + + def status(self) -> StatusPayload: + """Return current HEAD, branches, visible DAGs, and open runtime workspaces.""" + if not self._context.project_home or not project_config_exists( + require_project_home(self._context.project_home) + ): + return { + "head": None, + "branches": [], + "dags": {}, + "indexes": [], + } + with with_db(self) as db: + 
current_head_ops = make_head_ops(db) + head_state = current_head_state(current_head_ops) + return { + "head": { + "mode": head_state.mode, + "branch": head_state.branch, + "commit": head_state.commit, + }, + "branches": current_head_ops.list_branches(), + "dags": dag_map_for_commit(self, head_state.commit), + "indexes": current_head_ops.list_indexes(), + } + + @overload + def branch(self, *, remote: Literal[False] = False) -> BranchLocalPayload: ... + @overload + def branch(self, *, remote: Literal[True]) -> BranchRemotePayload: ... + def branch( + self, + *, + remote: Annotated[bool, "List remote-tracking branches instead of local branches."] = False, + ) -> BranchLocalPayload | BranchRemotePayload: + """List local branches or discovered remote-tracking branches.""" + if remote: + return {"branches": remote_tracking_branches(self), "remote": True} + with with_db(self) as db: + current_head_ops = make_head_ops(db) + return { + "branches": current_head_ops.list_branches(), + "head": current_head_branch(current_head_ops), + "remote": False, + } + + def log( + self, + revision: Annotated[str, "Revision selector such as HEAD, HEAD~1, main, or origin/main."] = "HEAD", + *, + limit: Annotated[int | None, "Maximum number of commits to return from newest to oldest."] = None, + ) -> LogPayload: + """Return commit summaries reachable from the selected revision.""" + resolved = resolve_dml_revision(self, revision) + with with_db(self) as db: + commit_ops = make_commit_ops(db) + refs = list(commit_ops.list(resolved.commit, limit=limit)) + commits = [commit_payload(ref, commit_ops.describe(ref)) for ref in refs] + return { + "revision": revision_payload(revision, resolved), + "commits": commits, + } + + def show( + self, + revision: Annotated[str, "Revision selector such as HEAD, HEAD~1, main, or origin/main."] = "HEAD", + ) -> ShowPayload: + """Return one commit summary together with the DAG map and DAG-level diff to its first parent.""" + resolved = resolve_dml_revision(self, 
revision) + with with_db(self) as db: + commit = commit_payload(resolved.commit, make_commit_ops(db).describe(resolved.commit)) + dags = dag_map_for_commit(self, resolved.commit) + base_commit = commit["parents"][0] if commit["parents"] else None + base_dags = dag_map_for_commit(self, base_commit) if base_commit is not None else {} + return { + "revision": revision_payload(revision, resolved), + "commit": commit, + "dags": dags, + "change": {"base": base_commit, **dag_map_diff(base_dags, dags)}, + } + + def diff( + self, + left: Annotated[str, "Base revision selector; for example HEAD~1 or main."] = "HEAD~1", + right: Annotated[str, "Compare-against revision selector; for example HEAD or origin/main."] = "HEAD", + ) -> DiffPayload: + """Return DAG-map additions, removals, and updates between two revisions.""" + left_resolved = resolve_dml_revision(self, left) + right_resolved = resolve_dml_revision(self, right) + left_dags = dag_map_for_commit(self, left_resolved.commit) + right_dags = dag_map_for_commit(self, right_resolved.commit) + return { + "left": revision_payload(left, left_resolved), + "right": revision_payload(right, right_resolved), + **dag_map_diff(left_dags, right_dags), + } + + def checkout( + self, + revision: Annotated[str, "Revision selector to attach or detach HEAD to."], + ) -> CheckoutAttachedPayload | CheckoutDetachedPayload: + """Move HEAD to a branch or detached commit without changing repository contents.""" + resolved = resolve_dml_revision(self, revision) + with with_db(self) as db: + current_head_ops = make_head_ops(db) + if resolved.kind == "branch" and resolved.branch is not None: + current_head_ops.write_attached_head(resolved.branch) + return {"mode": "attached", "branch": resolved.branch} + current_head_ops.write_detached_head(resolved.commit) + return {"mode": "detached", "branch": None} + + def fetch( + self, + remote_or_uri: Annotated[ + str, + "Remote name like origin or explicit project URI such as dml://alice/demo.", + ], + 
branch: Annotated[ + str | None, "Branch selector to fetch; defaults to the active or configured default branch." + ] = None, + ) -> Ref: + """Fetch a remote branch into local history.""" + project_home = require_project_home(self._context.project_home) + uri = project_remote_root( + project_home=project_home, + remote_or_uri=remote_or_uri, + branch=branch, + default_branch=self._context.default_branch, + ) + with with_db(self) as db: + return make_remote_ops(db, self).fetch_uri(uri) + + def pull( + self, + remote_or_uri: Annotated[ + str, + "Remote name like origin or explicit project URI such as dml://alice/demo.", + ], + remote_branch: Annotated[str | None, "Remote branch selector to pull; defaults to the target branch."] = None, + *, + branch: Annotated[str | None, "Local branch to update; defaults to the active attached branch."] = None, + user: Annotated[str, "User identity recorded for the merge commit created by the pull."], + ) -> Ref: + """Fetch a remote branch and merge it into a local branch in one operation.""" + project_home = require_project_home(self._context.project_home) + with with_db(self) as db: + target_branch = mutable_branch(branch=branch, head_ops=make_head_ops(db)) + uri = project_remote_root( + project_home=project_home, + remote_or_uri=remote_or_uri, + branch=remote_branch or target_branch, + default_branch=self._context.default_branch, + ) + with with_db(self) as db: + return make_remote_ops(db, self).pull_uri_into_branch(uri, target_branch, user=user) + + def push( + self, + tag: Annotated[str | None, "Optional tag name to publish instead of pushing a branch."] = None, + *, + branch: Annotated[str | None, "Local branch to publish; defaults to the active attached branch."] = None, + create: Annotated[bool, "Allow creating a missing remote branch when pushing by branch."] = False, + force: Annotated[bool, "Allow non-fast-forward remote branch updates."] = False, + ) -> str: + """Push a branch or tag to the configured remote project and 
return the remote ref path.""" + project = load_project_config(require_project_home(self._context.project_home)) + if not project.uri: + raise DmlRepoError("remote.project is required for project sync") + with with_db(self) as db: + source_branch = branch or make_head_ops(db).require_attached_head_branch() + with with_db(self) as db: + remote = make_remote_ops(db, self) + if tag: + return remote.push_project_tag(f"{project.uri}@{tag}", source_branch) + return remote.push_project_branch( + f"{project.uri}#{source_branch}", source_branch, create=create, force=force + ) + + def merge( + self, + revision: Annotated[str, "Revision selector to merge into the target branch."], + *, + branch: Annotated[str | None, "Branch to update; defaults to the active attached branch."] = None, + user: Annotated[str, "User identity recorded for the merge commit."], + ): + """Merge one revision into a mutable branch.""" + revision_ref = resolve_dml_revision_ref(self, revision) + with with_db(self) as db: + target_branch = mutable_branch(branch=branch, head_ops=make_head_ops(db)) + return make_commit_ops(db).merge_into_head(target_branch, revision_ref, user) + + def revert( + self, + revision: Annotated[str, "Revision selector whose changes should be reverted."], + *, + branch: Annotated[str | None, "Branch to update; defaults to the active attached branch."] = None, + user: Annotated[str, "User identity recorded for the revert commit."], + ): + """Create a revert commit for one revision on a mutable branch.""" + revision_ref = resolve_dml_revision_ref(self, revision) + with with_db(self) as db: + target_branch = mutable_branch(branch=branch, head_ops=make_head_ops(db)) + return make_commit_ops(db).revert(target_branch, revision_ref, user) + + @classmethod + def init( + cls, + project_home: Annotated[str, "Directory to initialize as a DaggerML project."] = ".", + *, + remote_root: Annotated[str | None, "Remote root URI such as s3://bucket/prefix."] = None, + user: Annotated[str | None, 
"Default user identity for the initialized runtime."] = None, + config_home: Annotated[str | None, "Override directory for global DaggerML config files."] = None, + remote_project: Annotated[str | None, "Remote project URI such as dml://alice/demo to seed from."] = None, + ) -> InitPayload: + """Initialize project state, config, and database for a DaggerML repository.""" + root = Path(project_home).resolve() + if not root.exists(): + raise FileNotFoundError(f"{root} does not exist") + project_home = str(root) + dml_dir = root / ".dml" + dml_dir.mkdir(parents=True, exist_ok=True) + config_existed = project_config_exists(project_home) + db_existed = db_path_for_project(project_home).exists() + project_cfg: DmlProjectConfig + if config_existed: + project_cfg = load_project_config(project_home) + else: + if remote_project: + parsed = parse_dml_project_uri(remote_project) + project_cfg = DmlProjectConfig(name=parsed.project, owner=parsed.owner, remote_root=remote_root or "") + else: + project_cfg = DmlProjectConfig(remote_root=remote_root or "") + if not gitignore_exists(project_home): + (dml_dir / ".gitignore").write_text("db\nHEAD\nrefs\n") + if not config_existed: + project_cfg.save(root) + runtime = cls(project_home=project_home, remote_root=remote_root, user=user, config_home=config_home) + resolved_branch = runtime._context.default_branch + if not config_existed and project_cfg.remote_root != runtime._context.remote_root: + project_cfg = DmlProjectConfig( + name=project_cfg.name, + owner=project_cfg.owner, + remote_root=runtime._context.remote_root, + ) + project_cfg.save(root) + if not db_existed: + create_db(project_home, branch=resolved_branch) + project_cfg = load_project_config(project_home) + if project_cfg.remote_project and not runtime._context.remote_root: + raise DmlRepoError("remote.root is required") + if project_cfg.remote_project and runtime._context.remote_root: + try: + fetched = runtime.fetch("origin", None) + except DmlRepoError: + if 
config_existed and not db_existed: + raise + else: + with with_db(runtime) as db: + make_head_ops(db).write_detached_head(fetched) + return { + "project_home": project_home, + "remote_root": runtime._context.remote_root, + "user": runtime._context.user, + "config_home": runtime._context.config.config_home, + "created": {"db": not db_existed, "config": not config_existed}, + } diff --git a/src/daggerml/_internal/dml_context.py b/src/daggerml/_internal/dml_context.py new file mode 100644 index 0000000..48dde10 --- /dev/null +++ b/src/daggerml/_internal/dml_context.py @@ -0,0 +1,123 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from daggerml._internal.config import DmlConfig, DmlProjectConfig +from daggerml._internal.revision_uri import RevisionUri, parse_revision_uri, stringify_revision_uri +from daggerml._internal.types import DmlRepoError + + +@dataclass(frozen=True) +class DmlRuntimeContext: + config: DmlConfig + + @property + def project_home(self) -> str | None: + return self.config.project.home + + @property + def remote_root(self) -> str: + return self.config.remote.root + + @property + def user(self) -> str | None: + return self.config.user + + @property + def default_branch(self) -> str: + return self.config.default_branch + + +def resolve_runtime_context( + *, + project_home: str | None = None, + remote_root: str | None = None, + user: str | None = None, + config_home: str | None = None, +) -> DmlRuntimeContext: + config = DmlConfig.resolve( + explicit={ + "project.home": project_home, + "remote.root": remote_root, + "user": user, + "config_home": config_home, + } + ) + return DmlRuntimeContext(config) + + +def resolve_global_context( + *, + project_home: str | None = None, + user: str | None = None, + config_home: str | None = None, +) -> DmlRuntimeContext: + config = DmlConfig.resolve( + scope="global", + explicit={ + "project.home": project_home, + "user": user, + 
"config_home": config_home, + }, + ) + return DmlRuntimeContext(config) + + +def current_head_branch(head_ops) -> str | None: + return head_ops.get_attached_head_branch() + + +def current_head_state(head_ops): + return head_ops.get_head_state() + + +def mutable_branch(*, branch: str | None, head_ops) -> str: + return branch or head_ops.require_attached_head_branch() + + +def project_remote_root(*, project_home: str, remote_or_uri: str, branch: str | None, default_branch: str) -> str: + project = DmlProjectConfig.load(project_home) + if not project.remote_project or project.owner is None or project.name is None: + raise DmlRepoError("remote.project is required for project sync") + if remote_or_uri.startswith("dml://"): + if "#" in remote_or_uri or "@" in remote_or_uri: + return remote_or_uri + selector = parse_revision_uri(remote_or_uri, default_branch=branch or default_branch) + return stringify_revision_uri(selector) + if remote_or_uri != "origin": + raise DmlRepoError(f"Unknown remote: {remote_or_uri}") + return stringify_revision_uri(RevisionUri(project.owner, project.name, branch=branch or default_branch)) + + +def require_project_home(project_home: str | None) -> str: + if not project_home: + raise DmlRepoError("project.home is required") + return project_home + + +def require_user(user: str | None, *, message: str) -> str: + if not user: + raise DmlRepoError(message) + return user + + +def db_path_for_project(project_home: str) -> Path: + return Path(project_home) / ".dml" / "db" + + +def project_config_exists(project_home: str) -> bool: + return (Path(project_home) / ".dml" / "config.toml").exists() + + +def gitignore_exists(project_home: str) -> bool: + return (Path(project_home) / ".dml" / ".gitignore").exists() + + +def load_project_config(project_home: str) -> DmlProjectConfig: + return DmlProjectConfig.load(project_home) + + +def config_dict(config: DmlConfig) -> dict[str, Any]: + return config.to_dict() diff --git 
a/src/daggerml/_internal/dml_resolution.py b/src/daggerml/_internal/dml_resolution.py new file mode 100644 index 0000000..a84dee2 --- /dev/null +++ b/src/daggerml/_internal/dml_resolution.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from daggerml._internal._db import Ref +from daggerml._internal.dml_context import load_project_config +from daggerml._internal.revision_uri import parse_revision_uri +from daggerml._internal.types import Commit, DmlRepoError, Tree + + +@dataclass(frozen=True) +class ResolvedRevision: + kind: str + commit: Ref + branch: str | None = None + tag: str | None = None + + +@dataclass(frozen=True) +class ResolvedDag: + ref: Ref + value: str + revision: ResolvedRevision | None = None + + +@dataclass(frozen=True) +class ResolvedNode: + ref: Ref + value: str + dag: str | None = None + revision: ResolvedRevision | None = None + + +def _walk_first_parent(commit_ops, commit: Ref, steps: int) -> Ref: + current = commit + for _ in range(steps): + info = commit_ops.describe(current) + parents = info["parents"] + if not parents: + raise DmlRepoError(f"Revision ancestry walks past the root commit: {current}") + current = parents[0] + return current + + +def _resolve_head(head_ops) -> Ref: + return head_ops.resolve_head_commit() + + +def _resolve_local_tag_ref(value: str, head_ops): + try: + return head_ops.get_branch_commit(value) + except DmlRepoError: + return None + + +def _resolve_project_tag_ref(value: str, *, head_ops, project_dir: str): + try: + project = load_project_config(project_dir) + except Exception: + return None + if not project.remote_project: + return None + return _resolve_local_tag_ref(f"{project.remote_project}@{value}", head_ops) + + +def _reject_ref_like_selector(value: str, expected_root_ns: str) -> None: + if ":" not in value: + return + try: + candidate = Ref(value) + except Exception: + return + if candidate.nss()[0] == expected_root_ns: + raise DmlRepoError(f"Expected 
def _list_commit_dags(*, commit: Ref, commit_ops) -> dict[str, Ref]:
    """Return a copy of the name -> ref mapping of DAGs stored in a commit's tree.

    Parameters
    ----------
    commit : Ref
        Reference expected to load as a ``Commit``.
    commit_ops
        Ops object whose ``_tx`` context manager is used for a read-only
        transaction.

    Raises
    ------
    DmlRepoError
        If ``commit`` does not load as a ``Commit`` or its tree does not load
        as a ``Tree``.
    """
    with commit_ops._tx(readonly=True) as txn:
        commit_obj = txn.get(commit)
        if not isinstance(commit_obj, Commit):
            raise DmlRepoError(f"Expected Commit at {commit}, got {type(commit_obj)}")
        tree = txn.get(commit_obj.tree)
        if not isinstance(tree, Tree):
            raise DmlRepoError(f"Expected Tree at {commit_obj.tree}, got {type(tree)}")
        # Copy so callers cannot mutate the loaded Tree's own mapping.
        return dict(tree.dags)


def resolve_revision(*, value: str, commit_ops, head_ops, project_dir: str) -> ResolvedRevision:
    """Resolve a user-supplied revision selector to a commit.

    Selectors are tried in this order:

    1. a ``Ref`` instance in the ``commit`` namespace (accepted as-is),
    2. a ``commit:<id>`` string,
    3. a ``dml://`` revision URI, resolved against local tag refs,
    4. ``HEAD`` or ``HEAD~N`` (first-parent ancestor, ``N >= 0``),
    5. a bare 64-character lowercase hex commit id,
    6. a branch name, falling back to a project tag of the same name.

    Raises
    ------
    DmlRepoError
        If the selector is empty, malformed, unsupported, or cannot be
        resolved.
    """
    if isinstance(value, Ref):
        if value.ns() != "commit":
            raise DmlRepoError(f"Expected commit ref, got: {value}")
        return ResolvedRevision(kind="commit", commit=value)

    if not isinstance(value, str) or not value:
        raise DmlRepoError("Revision is required")

    if value.startswith("commit:"):
        commit = Ref(value)
        if commit.ns() != "commit":
            raise DmlRepoError(f"Expected commit ref, got: {commit}")
        # Result is discarded; presumably describe() raises for an unknown
        # commit, making this an existence check — TODO confirm.
        commit_ops.describe(commit)
        return ResolvedRevision(kind="commit", commit=commit)

    if value.startswith("dml://"):
        try:
            parsed = parse_revision_uri(value, require_identifier=True)
        except ValueError as exc:
            raise DmlRepoError(str(exc)) from exc
        # NOTE(review): this lookup used to be repeated verbatim a second time
        # when ``parsed.branch`` was set, which could not change the outcome.
        # A distinct branch-specific fallback may have been intended; confirm
        # before adding one.
        commit = _resolve_local_tag_ref(value, head_ops)
        if commit is None:
            raise DmlRepoError(f"Revision {value!r} cannot be resolved locally")
        kind = "branch" if parsed.branch is not None else "tag"
        return ResolvedRevision(kind=kind, commit=commit, branch=parsed.branch, tag=parsed.tag)

    if value.startswith("HEAD"):
        base = _resolve_head(head_ops)
        if value == "HEAD":
            state = head_ops.get_head_state()
            return ResolvedRevision(kind="branch" if state.branch else "commit", commit=base, branch=state.branch)
        if not value.startswith("HEAD~"):
            raise DmlRepoError(f"Unsupported revision: {value}")
        try:
            steps = int(value[5:], 10)
        except ValueError as exc:
            raise DmlRepoError(f"Unsupported revision: {value}") from exc
        if steps < 0:
            # "HEAD~-1" parses as an int but is not a valid ancestor distance.
            raise DmlRepoError(f"Unsupported revision: {value}")
        return ResolvedRevision(kind="commit", commit=_walk_first_parent(commit_ops, base, steps))

    if len(value) == 64 and all(ch in "0123456789abcdef" for ch in value):
        commit = Ref(f"commit:{value}")
        commit_ops.describe(commit)
        return ResolvedRevision(kind="commit", commit=commit)

    try:
        commit = head_ops.get_branch_commit(value)
    except DmlRepoError:
        # Not a branch: try a project tag before re-raising the branch error.
        commit = _resolve_project_tag_ref(value, head_ops=head_ops, project_dir=project_dir)
        if commit is not None:
            return ResolvedRevision(kind="tag", commit=commit, tag=value)
        raise
    return ResolvedRevision(kind="branch", commit=commit, branch=value)


def resolve_revision_ref(*, value: str, commit_ops, head_ops, project_dir: str) -> Ref:
    """Resolve ``value`` like :func:`resolve_revision` and return only the commit ref."""
    return resolve_revision(value=value, commit_ops=commit_ops, head_ops=head_ops, project_dir=project_dir).commit


def resolve_dag_ref(
    *,
    value: str,
    revision: str | None = None,
    commit_ops,
    head_ops,
    project_dir: str,
    operation: str,
) -> ResolvedDag:
    """Resolve a DAG name at a revision (``HEAD`` when ``revision`` is falsy).

    ``operation`` is accepted for signature parity with
    :func:`resolve_node_ref`; it is not used in this function.

    Raises
    ------
    DmlRepoError
        If ``value`` is empty or ref-like, the revision cannot be resolved, or
        the commit has no DAG with that name.
    """
    if not isinstance(value, str) or not value:
        raise DmlRepoError("DAG name is required")
    _reject_ref_like_selector(value, "dag")

    resolved = resolve_revision(
        value=revision or "HEAD",
        commit_ops=commit_ops,
        head_ops=head_ops,
        project_dir=project_dir,
    )
    resolved_dag_ref = commit_ops.get_dag(resolved.commit, value)
    if resolved_dag_ref is None:
        raise DmlRepoError(f"DAG '{value}' not found")
    return ResolvedDag(ref=resolved_dag_ref, value=value, revision=resolved)


def resolve_node_ref(
    *,
    value: str,
    dag: str | None = None,
    revision: str | None = None,
    commit_ops,
    dag_ops,
    head_ops,
    project_dir: str,
    operation: str,
) -> ResolvedNode:
    """Resolve a node name, optionally scoped to a DAG, at a revision.

    With ``dag`` given, the node is looked up in that DAG only.  Without it,
    every DAG in the resolved commit is probed and exactly one must contain
    the node.

    Raises
    ------
    DmlRepoError
        If ``value`` is empty or ref-like, nothing matches, or several DAGs
        match (the message names ``operation`` and the matching DAGs).
    """
    if not isinstance(value, str) or not value:
        raise DmlRepoError("Node name is required")
    _reject_ref_like_selector(value, "node")

    if dag is not None:
        resolved_dag = resolve_dag_ref(
            value=dag,
            revision=revision,
            commit_ops=commit_ops,
            head_ops=head_ops,
            project_dir=project_dir,
            operation=operation,
        )
        return ResolvedNode(
            ref=dag_ops.get_node(resolved_dag.ref, value),
            value=value,
            dag=resolved_dag.value,
            revision=resolved_dag.revision,
        )

    resolved_revision = resolve_revision(
        value=revision or "HEAD",
        commit_ops=commit_ops,
        head_ops=head_ops,
        project_dir=project_dir,
    )
    # (dag name, node lookup result) for every DAG that contains the node.
    # Keeping the lookup result avoids a second get_node call for the winner.
    matches = []
    for dag_name, dag_ref in _list_commit_dags(commit=resolved_revision.commit, commit_ops=commit_ops).items():
        try:
            node_ref = dag_ops.get_node(dag_ref, value)
        except DmlRepoError:
            continue
        matches.append((dag_name, node_ref))

    if not matches:
        raise DmlRepoError(f"Node '{value}' not found at revision {revision or 'HEAD'}")
    if len(matches) > 1:
        dag_names = ", ".join(name for name, _node_ref in matches)
        raise DmlRepoError(
            f"dml dag {operation} requires dag for ambiguous node lookup '{value}' (matches: {dag_names})"
        )

    matched_name, matched_node_ref = matches[0]
    return ResolvedNode(
        ref=matched_node_ref,
        value=value,
        dag=matched_name,
        revision=resolved_revision,
    )
+ +Public API: + AdapterIO - Scoped S3 stdin/stdout surrogate for fire-and-monitor executors + ExecutionState - S3-backed lock + execution metadata helper + CancelledExecutionError - Raised when execution updates are interrupted by cancellation + LaunchState - TypedDict for caller-owned resumable launch objects + ExecutionRecord - TypedDict for runtime-owned execution lifecycle objects + LockRecord - TypedDict for the lock file contents + LOCK_TTL - Default lock time-to-live in seconds +""" + +from __future__ import annotations + +import json +import time +from typing import Any, Literal, TypedDict, cast +from urllib.parse import urlparse +from uuid import uuid4 + +import boto3 +from botocore.config import Config + +from daggerml._internal.types import DmlRepoError + +LOCK_TTL: float = 300.0 +S3_MAX_POOL_CONNECTIONS = 20 + + +class LockRecord(TypedDict): + lock_token: str + lock_expires_ts: float + + +class CancelledExecutionError(Exception): + pass + + +class LaunchState(TypedDict): + execution_id: str + cache_key: str + created_at: int + resume_state: dict[str, Any] + + +class ExecutionRecord(TypedDict): + execution_id: str + cache_key: str + lifecycle: Literal["running", "cancel-pending", "cancel-detached", "succeeded", "failed"] + updated_at: int + spawned_execution_ids: list[str] + cancellation_requested_by: str | None + + +class AdapterIO: + """Scoped S3 stdin/stdout surrogate for fire-and-monitor executors. + + Used by executors that launch a sub-adapter as a detached process (e.g. + Docker container, AWS Batch job) where direct stdin/stdout piping is not + possible. Paths are derived deterministically from ``(cache_key, exec_id, + name)`` so both ``start()`` and ``poll()`` can access the same objects + without storing URIs in executor state. + + All keys live under ``{protocol-prefix}/io/{cache_key}/{exec_id}/{name}/``. + + Obtain via ``ExecutionState.adapter_io(exec_id, name)`` — do not construct + directly. 
+ + Parameters + ---------- + exec_id: + UUID identifying the current execution attempt. + name: + Caller-chosen identifier, conventionally ``"{adapter}:{executor}"`` + (e.g. ``"local:docker"``, ``"lambda:batch"``). + """ + + def __init__(self, state: "ExecutionState", exec_id: str, name: str) -> None: + prefix = f"{state._exec_prefix}/io/{state.cache_key}/{exec_id}/{name}" + self._state = state + self._input_key = f"{prefix}/input.json" + self._output_key = f"{prefix}/output.json" + + @property + def input_uri(self) -> str: + """S3 URI for the sub-adapter input payload (no S3 call made).""" + return f"s3://{self._state._bucket}/{self._input_key}" + + @property + def output_uri(self) -> str: + """S3 URI for the sub-adapter output result (no S3 call made).""" + return f"s3://{self._state._bucket}/{self._output_key}" + + def write_input(self, data: bytes) -> str: + """Write ``data`` to the input S3 key and return ``input_uri``.""" + self._state._put_object(self._input_key, data) + return self.input_uri + + def read_output(self) -> bytes | None: + """Read the output S3 key. Returns ``None`` if not yet written.""" + result = self._state._get_object_bytes(self._output_key) + return result[0] if result is not None else None + + + +class ExecutionState: + """S3-backed advisory mutex for function execution. + + Function-execution coordination lives under ``{prefix}/dml/`` with: + + - ``locks/{cache_key}.json`` for the advisory mutex, + - ``active/{cache_key}`` for the active execution id, + - ``exec/state/{execution_id}.json`` for mutable execution state, + - ``exec/edges/{callee_execution_id}/{caller_execution_id}.json`` for canonical lineage, + - ``exec/invalidate/{execution_id}.json`` for invalidation tombstones, + - ``io/{cache_key}/{exec_id}/{name}/`` for adapter I/O (see :class:`AdapterIO`). + + Lock lifecycle is create-only (``If-None-Match: *``) then delete — no + updates. + + Parameters + ---------- + cache_key: + Unique identifier for this execution (typically the argv_ref id). 
+ remote_root: + S3 URI of the form ``s3://bucket[/prefix]``. Raises ``DmlRepoError`` + if absent or malformed. + """ + + def __init__(self, cache_key: str, *, remote_root: str) -> None: + if not isinstance(cache_key, str) or not cache_key: + raise DmlRepoError("ExecutionState cache_key must be a non-empty string") + parsed = urlparse(remote_root) + if parsed.scheme != "s3" or not parsed.netloc: + raise DmlRepoError( + f"ExecutionState remote_root must be a valid s3:// URI, got: {remote_root!r}" + ) + bucket = parsed.netloc + prefix = parsed.path.strip("/") + exec_prefix = f"{prefix}/dml" if prefix else "dml" + self.cache_key = cache_key + self._bucket = bucket + self._exec_prefix = exec_prefix + self._lock_key = f"{exec_prefix}/locks/{cache_key}.json" + self._active_key = f"{exec_prefix}/active/{cache_key}" + self._lock_token: str | None = None + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @staticmethod + def _s3(): + return boto3.client("s3", config=Config(max_pool_connections=S3_MAX_POOL_CONNECTIONS)) + + def _key_for_launch_state(self, execution_id: str) -> str: + return f"{self._exec_prefix}/exec/launch/{execution_id}.json" + + def _key_for_execution(self, execution_id: str) -> str: + return f"{self._exec_prefix}/exec/state/{execution_id}.json" + + def _key_for_edge(self, callee_execution_id: str, caller_execution_id: str) -> str: + return f"{self._exec_prefix}/exec/edges/{callee_execution_id}/{caller_execution_id}.json" + + def _key_for_edge_prefix(self, callee_execution_id: str) -> str: + return f"{self._exec_prefix}/exec/edges/{callee_execution_id}/" + + def _key_for_invalidation(self, execution_id: str) -> str: + return f"{self._exec_prefix}/exec/invalidate/{execution_id}.json" + + def _key_for_cancellation_tombstone(self, execution_id: str) -> str: + return f"{self._exec_prefix}/exec/cancelled/{execution_id}.json" + + def 
_get_object_bytes(self, key: str) -> tuple[bytes, str] | None: + """Return object bytes and ETag, or None if the file does not exist.""" + try: + resp = self._s3().get_object(Bucket=self._bucket, Key=key) + return resp["Body"].read(), resp["ETag"].strip('"') + except Exception as e: + code = getattr(e, "response", {}).get("Error", {}).get("Code", "") + if code in ("NoSuchKey", "404"): + return None + raise + + def _get_object(self) -> LockRecord | None: + payload = self._get_object_bytes(self._lock_key) + return None if payload is None else json.loads(payload[0]) + + def _put_object(self, key: str, body: bytes, *, if_match: str | None = None, if_none_match: bool = False) -> bool: + """Conditional PUT. Returns False on precondition failure.""" + try: + kwargs: dict[str, Any] = { + "Bucket": self._bucket, + "Key": key, + "Body": body, + } + if key.endswith(".json"): + kwargs["ContentType"] = "application/json" + if if_match is not None: + kwargs["IfMatch"] = if_match + if if_none_match: + kwargs["IfNoneMatch"] = "*" + self._s3().put_object(**kwargs) + return True + except Exception as e: + code = getattr(e, "response", {}).get("Error", {}).get("Code", "") + if code in ("PreconditionFailed", "412"): + return False + raise + + def _put_object_if_absent(self, record: LockRecord) -> bool: + """PUT with ``If-None-Match: *``. 
    def _delete_object(self, key: str) -> None:
        """Delete the object stored at ``key`` in this state's bucket.

        The key is whatever the caller passes; it is not limited to the lock
        file.  An error whose code is ``NoSuchKey``/``404`` is swallowed, so the
        call is a no-op when the object is already absent.  Any other error
        propagates.
        """
        try:
            self._s3().delete_object(Bucket=self._bucket, Key=key)
        except Exception as e:
            # Errors without a ``response`` attribute yield an empty code and re-raise.
            code = getattr(e, "response", {}).get("Error", {}).get("Code", "")
            if code in ("NoSuchKey", "404"):
                return
            raise
current["cancellation_requested_by"] or incoming["cancellation_requested_by"] + merged: ExecutionRecord = { + "execution_id": current["execution_id"], + "cache_key": current["cache_key"], + "lifecycle": lifecycle, + "updated_at": max(current["updated_at"], incoming["updated_at"]), + "spawned_execution_ids": spawned_execution_ids, + "cancellation_requested_by": cancellation_requested_by, + } + return merged + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def adapter_io(self, exec_id: str, name: str) -> AdapterIO: + """Return a scoped :class:`AdapterIO` for the given execution attempt. + + Parameters + ---------- + exec_id: + UUID identifying the current execution attempt. + name: + Caller-chosen identifier, conventionally ``"{adapter}:{executor}"`` + (e.g. ``"local:docker"``, ``"lambda:batch"``). + """ + return AdapterIO(self, exec_id, name) + + def lock(self, ttl: float = LOCK_TTL) -> bool: + """Acquire the advisory lock. + + Algorithm: + 1. GET existing file. + 2. If absent: PUT with ``If-None-Match: *``. + 3. If present and **expired**: DELETE then PUT. + 4. If present and **held**: return ``False``. + 5. A 412 from S3 on step 2 means a concurrent writer won — return ``False``. + + Returns True on success, False if the lock is currently held. + """ + now = time.time() + existing = self._get_object() + + if existing is not None: + if existing["lock_expires_ts"] > now: + # Lock is currently held by someone else + return False + # Lock is expired — steal it + self._delete_object(self._lock_key) + + token = str(uuid4()) + record: LockRecord = { + "lock_token": token, + "lock_expires_ts": now + ttl, + } + if not self._put_object_if_absent(record): + # 412 — concurrent writer grabbed it first + return False + + self._lock_token = token + return True + + def unlock(self) -> None: + """Release the advisory lock by deleting the lock file. 
+ + This is a best-effort delete; if the file is already absent (e.g. + expired and stolen), the call is a no-op. + """ + self._lock_token = None + self._delete_object(self._lock_key) + + def read_active_execution_id(self) -> str | None: + payload = self._get_object_bytes(self._active_key) + if payload is None: + return None + raw = payload[0].decode().strip() + if not raw: + return None + return raw + + def create_active_execution(self, execution_id: str) -> bool: + return self._put_object(self._active_key, execution_id.encode(), if_none_match=True) + + def delete_active_execution(self) -> None: + self._delete_object(self._active_key) + + def read_launch_state(self, execution_id: str) -> LaunchState | None: + payload = self._get_object_bytes(self._key_for_launch_state(execution_id)) + if payload is None: + return None + return json.loads(payload[0]) + + def create_launch_state(self, launch_state: LaunchState) -> bool: + return self._write_json_if_absent(self._key_for_launch_state(launch_state["execution_id"]), launch_state) + + def update_launch_state(self, launch_state: LaunchState) -> LaunchState: + self._write_json(self._key_for_launch_state(launch_state["execution_id"]), launch_state) + return launch_state + + def read_execution_record(self, execution_id: str) -> ExecutionRecord | None: + payload = self._get_object_bytes(self._key_for_execution(execution_id)) + if payload is None: + return None + return json.loads(payload[0]) + + def create_execution_record(self, record: ExecutionRecord) -> bool: + return self._write_json_if_absent(self._key_for_execution(record["execution_id"]), record) + + def update_execution_record(self, record: ExecutionRecord, *, retries: int = 8) -> ExecutionRecord: + key = self._key_for_execution(record["execution_id"]) + for _ in range(retries): + current, etag = self._read_json(key) + if current is None: + if self._write_json_if_absent(key, record): + return record + else: + current_record = cast(ExecutionRecord, current) + if 
current_record["lifecycle"].startswith("cancel-") and not record["lifecycle"].startswith("cancel-"): + raise CancelledExecutionError(f"Execution cancelled: {record['execution_id']}") + merged = self._merge_execution_record(current_record, record) + if self._write_json_if_match(key, merged, cast(str, etag)): + return merged + raise DmlRepoError(f"Failed to update execution state object: {key}") + + def record_execution_dependency( + self, + *, + caller_execution_id: str, + callee_execution_id: str, + retries: int = 8, + ) -> None: + edge = { + "caller_execution_id": caller_execution_id, + "callee_execution_id": callee_execution_id, + } + key = self._key_for_edge(callee_execution_id, caller_execution_id) + for _ in range(retries): + if self._write_json_if_absent(key, edge): + return + existing, _etag = self._read_json(key) + if existing == edge: + return + raise DmlRepoError(f"Failed to write execution edge object: {key}") + + def delete_execution_dependency(self, *, caller_execution_id: str, callee_execution_id: str) -> None: + self._delete_object(self._key_for_edge(callee_execution_id, caller_execution_id)) + + def create_invalidation_record( + self, + *, + execution_id: str, + cache_key: str, + requested_by: str, + requested_at: int, + ) -> bool: + return self._write_json_if_absent( + self._key_for_invalidation(execution_id), + { + "execution_id": execution_id, + "cache_key": cache_key, + "requested_by": requested_by, + "requested_at": requested_at, + }, + ) + + def create_cancellation_tombstone( + self, *, execution_id: str, cache_key: str, requested_by: str, requested_at: int + ) -> bool: + return self._write_json_if_absent( + self._key_for_cancellation_tombstone(execution_id), + { + "execution_id": execution_id, + "cache_key": cache_key, + "requested_by": requested_by, + "requested_at": requested_at, + }, + ) + + def list_execution_callers(self, callee_execution_id: str) -> list[str]: + prefix = self._key_for_edge_prefix(callee_execution_id) + paginator = 
# Context-local identifiers of the execution currently in progress; both
# default to ``None``.
_CURRENT_EXECUTION_ID: ContextVar[str | None] = ContextVar("daggerml_current_execution_id", default=None)
_CURRENT_CACHE_KEY: ContextVar[str | None] = ContextVar("daggerml_current_cache_key", default=None)


def get_current_execution_context() -> tuple[str | None, str | None]:
    """Return the ``(execution_id, cache_key)`` pair visible in the current context."""
    current_execution_id = _CURRENT_EXECUTION_ID.get()
    current_cache_key = _CURRENT_CACHE_KEY.get()
    return current_execution_id, current_cache_key


@contextmanager
def execution_context(execution_id: str | None, cache_key: str | None) -> Iterator[None]:
    """Bind ``execution_id`` and ``cache_key`` for the duration of a ``with`` block.

    The previous values are restored on exit, including when the block raises.
    """
    bindings = (
        (_CURRENT_EXECUTION_ID, _CURRENT_EXECUTION_ID.set(execution_id)),
        (_CURRENT_CACHE_KEY, _CURRENT_CACHE_KEY.set(cache_key)),
    )
    try:
        yield
    finally:
        # Restore in the same order the values were set.
        for variable, token in bindings:
            variable.reset(token)
+"""Foundation class providing core repository operations shared across all subsystems. + +This module provides BaseOps, a base class that encapsulates common database +operations used by all repository subsystems. It handles transactions, +object storage/retrieval, and reference management. + +Public API: + BaseOps - Base class for repository operations +""" + +import logging +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Any, Iterator, List, Optional, cast + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +from daggerml._internal._db import ( + DmlDbEnv, + DmlDbEnvReopenedError, + DmlDbEnvTxn, + DmlDbKeyNotFoundError, + DmlDbMapFullError, + Ref, +) +from daggerml._internal.types import NAMESPACES, Commit, Dag, Deletable, DmlRepoError, Error, Tree, Uri + + +@dataclass +class CommitCtx: + """Context helper for accessing commit/tree/dag state without pointers.""" + + commit: Commit + tree: Tree + dag: Dag = cast(Dag, None) + + +def with_retry(fn): + """Decorator to retry transactions on recoverable database errors. + + Handles two types of recoverable errors: + 1. DmlDbMapFullError: Database is full - automatically resize and retry + 2. DmlDbEnvReopenedError: Environment was repaired (e.g., after fork/EINVAL) - retry transaction + + Parameters + ---------- + fn : Callable + The function to wrap. + + Returns + ------- + Callable + The wrapped function. 
+ """ + + def wrapper(self, *args, **kwargs): + retries = 0 + max_retries = 8 + while True: + try: + return fn(self, *args, **kwargs) + except DmlDbMapFullError: + map_size = self._db.get_size() + new_map_size = map_size * 2 + logging.warning("Resizing database from %d to %d bytes", map_size, new_map_size) + self._db.resize(new_map_size) + retries += 1 + if retries >= max_retries: + raise + except DmlDbEnvReopenedError as e: + # Environment was repaired (all transactions invalidated), retry the operation + retries += 1 + if retries >= max_retries: + raise + logging.info( + "Database environment reopened, retrying transaction (attempt %d/%d): %s", retries, max_retries, e + ) + + return wrapper + + +@dataclass +class TxnContext: + """Transaction context for BaseOps operations. + + Attributes + ---------- + txn : DmlDbEnvTxn + The active database transaction. + logger : logging.Logger + Logger for the transaction context. + """ + + db: DmlDbEnv + txn: DmlDbEnvTxn + logger: logging.Logger + + def put(self, obj: Any, to: Optional[Ref] = None) -> Ref: + """Store object and return its reference. + + Parameters + ---------- + obj : Any + Object to store. + to : Optional[Ref] + Optional reference to store at. If None, generates new reference. + + Returns + ------- + Ref + Reference to the stored object. + + Raises + ------ + DmlRepoError + If the object cannot be stored. 
+ """ + if isinstance(obj, Ref): + # For Ref objects, we store them directly without validation/serialization + # This is used for head pointers that store Ref values + ns = to.ns() if to else None + if ns and ns not in self.db.namespaces: + raise ValueError(f"Unknown namespace: {ns}") + return self.txn.put(obj, ns=ns, to=to) + obj._validate() + ns = None + if to is None: + ns = obj._ns + if (ns or to.ns()) not in self.db.namespaces: + raise ValueError(f"Unknown namespace: {ns}") + ref = self.txn.put(obj.to_dict(), ns=ns, to=to) + if isinstance(obj, Uri): + self._cleanup_opposite_entry(ref, opposite_ns="deletable", noun="uri") + elif isinstance(obj, Deletable): + self._cleanup_opposite_entry(ref, opposite_ns="datum-uri", noun="deletable") + return ref + + def get(self, ref: Ref) -> Any: + """Retrieve object by reference. + + Parameters + ---------- + ref : Ref + Reference to the object to retrieve. + + Returns + ------- + Any + The object stored at the reference. + + Raises + ------ + DmlRepoError + If the reference is invalid or object cannot be retrieved. + """ + try: + ns = ref.ns() + cls = NAMESPACES.get(ns) + if cls is None: + raise ValueError(f"Unknown namespace: {ns}") + obj = cls.from_dict(self.txn.get(ref)) + return obj + except DmlDbKeyNotFoundError: + # Explicitly silence exception chaining for not-found case + raise DmlRepoError(f"Object not found: {ref}") from None + except Error: + self.logger.exception(f"get: ref={ref}") + raise + except Exception as e: + self.logger.exception(f"get: ref={ref}") + raise DmlRepoError(f"Failed to get object: {e}") from e + + def delete(self, ref: Ref) -> None: + """Delete object at reference. + + Parameters + ---------- + ref : Ref + Reference to the object to delete. + + Raises + ------ + DmlRepoError + If the object cannot be deleted. 
+ """ + try: + self.txn.delete(ref) + except Exception as e: + self.logger.exception(f"delete: ref={ref}") + raise DmlRepoError(f"Failed to delete object: {e}") from e + + def exists(self, ref: Ref) -> bool: + """Check if object exists at reference. + + Parameters + ---------- + ref : Ref + Reference to check. + + Returns + ------- + bool + True if object exists, False otherwise. + + Raises + ------ + DmlRepoError + If existence check fails. + """ + try: + return self.txn.exists(ref) + except Exception as e: + self.logger.exception(f"exists: ref={ref}") + raise DmlRepoError(f"Failed to check object existence: {e}") from e + + def iter(self, namespace: str) -> Iterator[Ref]: + """Iterate over objects in a namespace. + + Parameters + ---------- + namespace : str + Namespace to iterate over. + + Yields + ------ + Ref + References to objects in the namespace. + + Raises + ------ + DmlRepoError + If iteration fails. + """ + try: + for entry in self.txn.iter(namespace): + if isinstance(entry, tuple): + yield entry[0] + else: + yield entry + except DmlDbKeyNotFoundError: + self.logger.info("No objects found in the repository.") + except Exception as e: + self.logger.exception(f"iter: namespace={namespace}") + raise DmlRepoError(f"Failed to iterate over namespace '{namespace}': {e}") from e + + def load_dict(self, manifest: dict) -> Ref: + """Load an object from a commit-manifest JSON payload. + + Parses the commit-manifest JSON structure and restores the object graph. + + Parameters + ---------- + manifest : dict + Dictionary representing the commit-manifest structure. + + Returns + ------- + Ref + Reference to the loaded object. + + Raises + ------ + DmlRepoError + If the load operation fails. 
+ """ + try: + # Validate manifest structure + if manifest.get("schema") != 0: + raise DmlRepoError(f"Unsupported schema version: {manifest.get('schema')}") + if manifest.get("kind") != "local-manifest": + raise DmlRepoError(f"Invalid manifest kind: {manifest.get('kind')}") + ns = manifest.get("root-ns") + if ns is None: + raise DmlRepoError("Manifest missing namespace") + id_ = manifest.get("root-id") + if id_ is None: + raise DmlRepoError(f"Manifest missing {ns} ID") + if id_ not in manifest.get("closure", {}).get(ns, {}): + raise DmlRepoError(f"Manifest closure missing {ns} ID {id_}") + retval = Ref(f"{ns}:{id_}") + dump_list = [] + closure = manifest.get("closure", {}) + for ns, objects in closure.items(): + for obj_id, dump in objects.items(): + dump_list.append({"id": obj_id, "ns": ns, "dump": dump}) + self._load_from_list(dump_list) + return retval + except Exception as e: + raise DmlRepoError(f"Failed to load object: {e}") from e + + def _load_from_list(self, dump_list: List[dict]) -> Ref: + """Internal method to load objects from list of dicts.""" + root_ref = None + for item in dump_list: + expected_id = item["id"] + # Insert without specifying id, let database generate it + inserted_ref = self.txn.put(item["dump"], ns=item["ns"], raw=True) + # Confirm the id matches + if inserted_ref.id() != expected_id: + raise DmlRepoError(f"ID mismatch during load: expected {expected_id}, got {inserted_ref.id()}") + # Keep track of the first (root) ref + if root_ref is None: + root_ref = inserted_ref + if root_ref is None: + raise DmlRepoError("Empty dump list") + return root_ref + + def _cleanup_opposite_entry(self, ref: Ref, *, opposite_ns: str, noun: str) -> None: + """Remove a stale opposite entry for the given reference. + + Parameters + ---------- + ref : Ref + Reference to clean up opposite entries for. + opposite_ns : str + Namespace of the opposite entry to remove. + noun : str + Object type for logging context. 
    def _log_operation(self, operation: str, **kwargs) -> None:
        """Log ``operation`` together with its keyword context.

        The context is rendered as comma-separated ``key=value`` pairs and the
        message is emitted through ``logger.exception``, i.e. at ERROR level
        with the currently handled exception's traceback attached.

        Parameters
        ----------
        operation : str
            Name of the operation being logged.
        **kwargs
            Additional context to include in log message.
        """
        context = ", ".join(f"{k}={v}" for k, v in kwargs.items())
        # NOTE(review): ``logger.exception`` is meant for use inside an
        # ``except`` block.  Called elsewhere it still logs at ERROR level and
        # appends "NoneType: None" in place of a traceback.  Confirm this
        # helper is only used while handling an exception.
        self.logger.exception(f"{operation}: {context}")
    def _with_ops(self, **changes) -> Self:
        """Apply attribute overrides to this instance and return it.

        Each keyword is assigned onto ``self`` with ``setattr``.  No copy is
        made, so the instance the caller already holds is modified in place.

        Parameters
        ----------
        **changes
            Attribute names and the values to assign.

        Returns
        -------
        Self
            This same instance, after the assignments.
        """
        # NOTE(review): if copy semantics are wanted here,
        # ``dataclasses.replace`` would be the usual tool.  Confirm that no
        # caller relies on the in-place mutation before changing it.
        for k, v in changes.items():
            setattr(self, k, v)
        return self
+ +Public API: + CacheOps - Class for cache management operations +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Iterator, Optional +from urllib.parse import urlparse + +from daggerml._internal._db import Ref +from daggerml._internal.ops.base_ops import BaseOps +from daggerml._internal.types import DmlRepoError + + +@dataclass +class CacheOps(BaseOps): + """CRUD operations for managing cached computation results.""" + + remote_root: str + + def _remote_ops(self): + if not self.remote_root: + raise DmlRepoError("Remote cache context required") + from daggerml._internal.ops.remote import RemoteOps + + parsed = urlparse(self.remote_root) + if parsed.scheme != "s3" or not parsed.netloc: + raise DmlRepoError(f"Invalid remote root URI: {self.remote_root!r}") + prefix = parsed.path.strip("/") + return RemoteOps(_db=self._db, bucket=parsed.netloc, prefix=f"{prefix}/dml" if prefix else "dml") + + def _require_remote_context(self): + return self._remote_ops() + + @staticmethod + def _cache_key(argv_ref: Ref, txn) -> str: + if argv_ref.ns() != "node-argv": + raise DmlRepoError(f"Expected argv ref for cache key, got: {argv_ref}") + argv_datum_ref = txn.get(argv_ref).datum_ref(txn) + if argv_datum_ref.ns() != "datum-list": + raise DmlRepoError(f"Expected argv list datum ref for cache key, got: {argv_datum_ref}") + return argv_datum_ref.id() + + def _get(self, argv_ref: Ref, txn) -> Optional[Ref]: + """Get cached result for `argv_ref` within a transaction.""" + remote_ops = self._require_remote_context() + cache_key = self._cache_key(argv_ref, txn) + target = remote_ops.get_cache_ref(cache_key) + if target is None: + return None + return remote_ops.load_ptr_in_txn(target, txn, expected_root_ns="dag") + + def put(self, dag_ref: Ref, *, execution_id: str) -> str: + """Create or overwrite a cache entry for `dag_ref`.""" + try: + if dag_ref.ns() != "dag": + raise DmlRepoError(f"Expected dag ref for cache value, got: {dag_ref}") + 
def clear(self) -> int:
    """Remove every cache entry and report how many were deleted.

    Returns
    -------
    int
        Number of entries for which the remote delete reported success.

    Raises
    ------
    DmlRepoError
        If the remote context is unavailable or any remote call fails.
    """
    try:
        remote = self._require_remote_context()
        # Entries are listed and deleted one by one; only truthy delete
        # results are counted.
        return sum(1 for key, _unused in remote.list_cache_refs() if remote.delete_cache_ref(key))
    except Exception as exc:
        raise DmlRepoError(f"Failed to clear cache entries: {exc}") from exc
+ """ + xs = list(xs) + result = [] + while len(xs): + x = xs.pop(0) + with self._tx(readonly=True) as txn: + if x is not None and txn.get(x) and x not in result: + result.append(x) + xs = txn.get(x).parents + xs + return result + + def _merge_base(self, a, b): + """Find the common ancestor of two commits. + + Parameters + ---------- + a : Ref + First commit reference. + b : Ref + Second commit reference. + + Returns + ------- + Ref + The merge base (common ancestor) commit. + """ + while True: + aa = self._topo_sort(a) + ab = self._topo_sort(b) + if set(aa).issubset(ab) or len(set(aa).intersection(ab)) == 0: + return a + if set(ab).issubset(aa): + return b + with self._tx(readonly=True) as txn: + pivot = txn.get(max(set(aa).difference(ab), key=aa.index)) + assert len(pivot.parents), "no merge base found" + if len(pivot.parents) == 1: + return pivot.parents[0] + a, b = pivot.parents + + def _diff(self, t1: Ref, t2: Ref, txn) -> dict: + """Calculate diff between two trees. + + Parameters + ---------- + t1 : Ref + First tree reference. + t2 : Ref + Second tree reference. + txn : TxnContext + Transaction context to use. + + Returns + ------- + dict + Dictionary with 'add' and 'rem' keys containing DAG changes. + """ + d1 = txn.get(t1).dags + d2 = txn.get(t2).dags + result = {"add": {}, "rem": {}} + for k in set(d1.keys()).union(d2.keys()): + if k not in d2: + result["rem"][k] = d1[k] + elif k not in d1: + result["add"][k] = d2[k] + elif d1[k] != d2[k]: + result["rem"][k] = d1[k] + result["add"][k] = d2[k] + return result + + def _patch(self, tree: Ref, *diffs, txn) -> Ref: + """Apply diffs to a tree. + + Parameters + ---------- + tree : Ref + Tree reference to patch. + *diffs : dict + Diff dictionaries to apply. + txn : TxnContext + Transaction context to use. + + Returns + ------- + Ref + Reference to the patched tree. 
+ """ + tree_obj: Tree = txn.get(tree) + dags = dict(tree_obj.dags) + for diff in diffs: + for k, _v in diff["rem"].items(): + dags.pop(k, None) + for k, v in diff["add"].items(): + dags[k] = v + return txn.put(Tree(dags)) + + def _is_ancestor_in_txn(self, ancestor: Ref, descendant: Ref, txn) -> bool: + stack = [descendant] + seen = set() + while stack: + current = stack.pop() + if current == ancestor: + return True + if current in seen: + continue + seen.add(current) + stack.extend(txn.get(current).parents) + return False + + def is_ancestor(self, ancestor: Ref, descendant: Ref) -> bool: + with self._tx(readonly=True) as txn: + return self._is_ancestor_in_txn(ancestor, descendant, txn) + + def list(self, head: Ref, limit: Optional[int] = None) -> Iterator[Ref]: + """Get commit history starting from head. + + Walks the commit history following parent references from the given + branch or commit tip. Yields commit references in reverse chronological order + (newest to oldest). + + Parameters + ---------- + head : Ref + Starting commit or branch reference. + limit : Optional[int] + Maximum number of commits to return. If None, returns all. + + Yields + ------ + Ref + Commit references in reverse chronological order. + + Raises + ------ + DmlRepoError + If the starting commit doesn't exist or history traversal fails. 
def merge(self, commit1, commit2, user: str) -> Ref:
    """Create a merge commit joining two commits.

    The merge base is found first, then both sides are diffed against the
    base tree and the two diffs are applied on top of it.

    Parameters
    ----------
    commit1 : Ref
        First commit reference (first parent of the result).
    commit2 : Ref
        Second commit reference (second parent of the result).
    user : str
        Username recorded as the author of the merge commit.

    Returns
    -------
    Ref
        Reference to the stored merge commit.

    Raises
    ------
    DmlRepoError
        If both sides touch the same DAG name with different values.
    """
    base_ref = self._merge_base(commit1, commit2)
    with self._tx(readonly=True) as txn:
        base_obj: Commit = txn.get(base_ref)
        first_obj: Commit = txn.get(commit1)
        second_obj: Commit = txn.get(commit2)

    with self._tx(readonly=False) as txn:
        base_tree, first_tree, second_tree = base_obj.tree, first_obj.tree, second_obj.tree
        ours = self._diff(base_tree, first_tree, txn)
        theirs = self._diff(base_tree, second_tree, txn)
        conflicts = []
        # The same name changed on both sides is only a conflict when the
        # recorded values disagree.
        for section in ("add", "rem"):
            shared = set(ours[section].keys()).intersection(theirs[section].keys())
            conflicts.extend(name for name in shared if ours[section][name] != theirs[section][name])
        if conflicts:
            raise DmlRepoError(f"Merge conflicts: {sorted(conflicts)}")
        merged_tree = self._patch(base_tree, ours, theirs, txn=txn)
        return txn.put(
            Commit(
                parents=[commit1, commit2],
                tree=merged_tree,
                author=user,
                message=f"Merge {commit1.id()[:8]} into {commit2.id()[:8]}",
            )
        )
Ref, user: str) -> Ref: + hops = HeadOps(_db=self._db) + fast_forward = False + with self._tx(readonly=False) as txn: + current = hops.get_branch_commit(branch) + if self._is_ancestor_in_txn(current, other, txn): + fast_forward = True + if self._is_ancestor_in_txn(other, current, txn): + return current + if fast_forward: + hops.update_branch_commit(branch, current, other) + return other + merged = self.merge(current, other, user) + hops.update_branch_commit(branch, current, merged) + return merged + + def revert(self, branch: str, commit: Ref, user: str) -> Ref: + if commit.ns() != "commit": + raise DmlRepoError("Revert expects head and commit refs") + hops = HeadOps(_db=self._db) + with self._tx(readonly=False) as txn: + current_head = hops.get_branch_commit(branch) + target = txn.get(commit) + if len(target.parents) != 1: + raise DmlRepoError("Can only revert commits with exactly one parent") + before_tree = txn.get(txn.get(target.parents[0]).tree) + after_tree = txn.get(target.tree) + current_tree = txn.get(txn.get(current_head).tree) + dags = dict(current_tree.dags) + conflicts = [] + for name in set(before_tree.dags) | set(after_tree.dags): + before_ref = before_tree.dags.get(name) + after_ref = after_tree.dags.get(name) + if before_ref == after_ref: + continue + if dags.get(name) != after_ref: + conflicts.append(name) + continue + if before_ref is None: + dags.pop(name, None) + else: + dags[name] = before_ref + if conflicts: + raise DmlRepoError(f"Revert conflicts: {sorted(set(conflicts))}") + new_tree = txn.put(Tree(dags=dags)) + new_commit = txn.put( + Commit(parents=[current_head], tree=new_tree, author=user, message=f"Revert {commit.id()[:8]}") + ) + hops.update_branch_commit(branch, current_head, new_commit) + return new_commit + + def checkout_dag( + self, + branch: str, + source_commit: Ref, + source_name: str, + *, + target_name: str | None = None, + replace: bool = False, + user: str, + ) -> Ref: + hops = HeadOps(_db=self._db) + target_name = 
def rebase(self, source, target, user: str):
    """Rebase `source` onto `target`.

    Parameters
    ----------
    source : Ref
        Commit to rebase.
    target : Ref
        Commit to rebase onto.
    user : str
        Username recorded as author of the replayed commit.

    Returns
    -------
    Ref
        `target` when the merge base is `source`, `source` when the merge
        base is `target`, otherwise the reference of a newly stored commit
        that replays `source` on top of `target`.

    Raises
    ------
    DmlRepoError
        If `source` must be replayed but does not have exactly one parent.
    """
    with self._tx(readonly=False) as txn:
        base = self._merge_base(source, target)
        if base == source:
            return target
        if base == target:
            return source

        original: Commit = txn.get(source)
        if len(original.parents) != 1:
            raise DmlRepoError("Can only rebase linear history")
        parent_ref = original.parents[0]
        # Replay what `source` changed relative to its parent on top of
        # the target's tree.
        target_tree = txn.get(target).tree
        delta = self._diff(txn.get(parent_ref).tree, original.tree, txn)
        rebased_tree = self._patch(target_tree, delta, txn=txn)
        return txn.put(
            Commit(
                parents=[target],
                tree=rebased_tree,
                author=user,
                message=original.message,
                dag=original.dag,
                created=original.created,
                modified=now(),
            )
        )
def describe(self, commit: Ref) -> dict:
    """Return the stored fields of a commit as a plain dict.

    Parameters
    ----------
    commit : Ref
        Reference in the ``commit`` namespace.

    Returns
    -------
    dict
        Keys ``id``, ``parents``, ``tree``, ``author``, ``message``,
        ``dag``, ``created`` and ``modified``.

    Raises
    ------
    DmlRepoError
        If `commit` is not a commit ref or the loaded object is not a
        `Commit`.
    """
    if commit.ns() != "commit":
        raise DmlRepoError(f"Expected commit reference, got: {commit}")
    with self._tx(readonly=True) as txn:
        loaded: Commit = txn.get(commit)
        if not isinstance(loaded, Commit):
            raise DmlRepoError(f"Expected Commit at {commit}, got {type(loaded)}")
        summary = {"id": commit.id()}
        for field_name in ("parents", "tree", "author", "message", "dag", "created", "modified"):
            summary[field_name] = getattr(loaded, field_name)
        return summary
+ """ + try: + hops = HeadOps(_db=self._db) + branch_name = branch or hops.require_attached_head_branch() + with self._tx(readonly=False) as txn: + current_commit_ref = hops.get_branch_commit(branch_name) + ctx = txn.get_commit_ctx(current_commit_ref) + # Check if DAG exists + if name not in ctx.tree.dags: + raise DmlRepoError(f"DAG '{name}' not found in branch commit tree") + # Create new tree without the specified DAG + ctx.tree.dags = {k: v for k, v in ctx.tree.dags.items() if k != name} + ctx.commit.tree = txn.put(ctx.tree) + ctx.commit.author = user + ctx.commit.parents = [current_commit_ref] + ctx.commit.message = f"Delete DAG '{name}'" + new_commit_ref = txn.put(ctx.commit) + hops.update_branch_commit(branch_name, current_commit_ref, new_commit_ref) + return self + except Exception as e: + raise DmlRepoError(f"Failed to delete DAG '{name}': {e}") from e diff --git a/src/daggerml/_internal/ops/config.py b/src/daggerml/_internal/ops/config.py new file mode 100644 index 0000000..707d647 --- /dev/null +++ b/src/daggerml/_internal/ops/config.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import json +import tomllib +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal + +from daggerml._internal.config import _validate_ref_name, validate_dml_project_uri, validate_remote_root +from daggerml._internal.types import DmlRepoError + +SCOPE_GLOBAL = "global" +SCOPE_LOCAL = "local" + +GLOBAL_KEYS = {"user", "default_branch", "remote.fetch_workers"} +LOCAL_KEYS = {"remote.project", "remote.root", "remote.fetch_workers"} +ALL_KEYS = GLOBAL_KEYS | LOCAL_KEYS + + +def _read_toml(path: Path) -> dict[str, Any]: + if not path.exists(): + return {} + return tomllib.loads(path.read_text()) + + +def _toml_value(value: Any) -> str: + if isinstance(value, str): + return json.dumps(value) + if isinstance(value, int): + return str(value) + if isinstance(value, list): + return f"[{', '.join(_toml_value(item) for item in value)}]" + 
raise DmlRepoError(f"Unsupported config value type: {type(value).__name__}") + + +def _write_toml(path: Path, data: dict[str, Any]) -> None: + lines: list[str] = [] + for section in ("project", "remote", "user", "defaults"): + section_data = data.get(section) + if not isinstance(section_data, dict) or not section_data: + continue + if lines: + lines.append("") + lines.append(f"[{section}]") + for key in sorted(section_data): + lines.append(f"{key} = {_toml_value(section_data[key])}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines) + ("\n" if lines else "")) + + +def _set_nested(data: dict[str, Any], section: str, key: str, value: Any) -> None: + section_data = data.setdefault(section, {}) + if not isinstance(section_data, dict): + section_data = {} + data[section] = section_data + section_data[key] = value + + +@dataclass +class ConfigOps: + project_home: str | None + config_home: str + + def _path_for_scope(self, scope: Literal["global", "local"]) -> Path: + if scope == SCOPE_GLOBAL: + return Path(self.config_home) / "config.toml" + if scope == SCOPE_LOCAL: + if not self.project_home: + raise DmlRepoError("Local config requires project.home (--project-home or DML_PROJECT_HOME)") + return Path(self.project_home) / ".dml" / "config.toml" + raise DmlRepoError(f"Unknown config scope: {scope}") + + def _validate_scope_key(self, scope: Literal["global", "local"], key: str) -> None: + if key not in ALL_KEYS: + raise DmlRepoError(f"Unsupported config key: {key}") + if scope == SCOPE_GLOBAL and key not in GLOBAL_KEYS: + raise DmlRepoError(f"Config key {key!r} is not valid in global scope") + if scope == SCOPE_LOCAL and key not in LOCAL_KEYS: + raise DmlRepoError(f"Config key {key!r} is not valid in local scope") + + def get(self, key: str, *, scope: Literal["global", "local"]) -> str | list[str] | None: + self._validate_scope_key(scope, key) + data = _read_toml(self._path_for_scope(scope)) + if key == "remote.project": + value = 
(data.get("remote") or {}).get("project") + return str(value) if value else None + if key == "remote.root": + value = (data.get("remote") or {}).get("root") + return str(value) if value else None + if key == "remote.fetch_workers": + value = (data.get("remote") or {}).get("fetch_workers") + return str(value) if value is not None else None + if key == "user": + value = (data.get("user") or {}).get("name") + return str(value) if value else None + if key == "default_branch": + value = (data.get("defaults") or {}).get("branch") + return str(value) if value else None + raise DmlRepoError(f"Unsupported config key: {key}") + + def set(self, key: str, value: str, *, scope: Literal["global", "local"]) -> str | list[str]: + self._validate_scope_key(scope, key) + if key == "remote.project": + value = validate_dml_project_uri(str(value)) + elif key == "remote.root": + value = validate_remote_root(str(value)) + elif key == "default_branch": + _validate_ref_name("branch", str(value)) + elif key == "remote.fetch_workers": + try: + workers = int(str(value), 10) + except ValueError as exc: + raise DmlRepoError("remote.fetch_workers must be a positive integer") from exc + if workers <= 0: + raise DmlRepoError("remote.fetch_workers must be a positive integer") + value = str(workers) + elif key == "user" and not str(value): + raise DmlRepoError("user must be a non-empty string") + path = self._path_for_scope(scope) + data = _read_toml(path) + if key == "remote.project": + _set_nested(data, "remote", "project", value) + elif key == "remote.root": + _set_nested(data, "remote", "root", value) + elif key == "remote.fetch_workers": + _set_nested(data, "remote", "fetch_workers", int(str(value))) + elif key == "user": + _set_nested(data, "user", "name", value) + elif key == "default_branch": + _set_nested(data, "defaults", "branch", value) + else: + raise DmlRepoError(f"Unsupported config key: {key}") + _write_toml(path, data) + return value diff --git a/src/daggerml/_internal/ops/dag.py 
def list(self) -> list[dict[str, Any]]:
    """List all DAGs stored in the repository.

    Returns
    -------
    list[dict[str, Any]]
        One dict per DAG with keys ``name`` and ``id`` (both the ref id),
        ``nodes``, ``names``, ``result``, ``argv`` and ``kwargv``.

    Raises
    ------
    DmlRepoError
        If a DAG contains more than one kwargv node.

    Notes
    -----
    A ref whose object cannot be loaded (``txn.get`` returns None) is
    skipped with a warning. Previously such a ref was guarded for ``argv``
    and ``kwargv`` but then raised ``AttributeError`` on ``dag.nodes``.
    """
    with self._tx(readonly=True) as txn:
        out = []
        for dag_ref in txn.iter("dag"):
            dag = txn.get(dag_ref)
            if dag is None:
                # Dangling ref: nothing to describe, so skip it rather
                # than fail the whole listing.
                self._logger.warning("Skipping dag ref with no stored object: %s", dag_ref)
                continue
            argv_ref = dag.argv
            kwargv_ref = self._kwargv_ref_from_nodes(dag, txn)
            out.append(
                {
                    "name": dag_ref.id(),
                    "id": dag_ref.id(),
                    "nodes": dag.nodes,
                    "names": dag.names,
                    "result": dag.result,
                    "argv": argv_ref,
                    "kwargv": kwargv_ref,
                }
            )
        return out
def get_kwargv(self, dag_ref: Ref) -> Ref:
    """Return the kwargv node reference of a DAG.

    Parameters
    ----------
    dag_ref : Ref
        Reference to the DAG to query (must be namespace 'dag').

    Returns
    -------
    Ref
        Reference to the DAG's kwargv node.

    Raises
    ------
    DmlRepoError
        If `dag_ref` is not a dag ref, the DAG cannot be loaded, or the DAG
        has no kwargv node.
    """
    ref_ns = dag_ref.ns()
    if ref_ns != "dag":
        raise DmlRepoError(f"Expected dag ref, got: {dag_ref}")
    with self._tx(readonly=True) as txn:
        dag_obj = txn.get(dag_ref)
        if dag_obj is None:
            raise DmlRepoError(f"Object not found: {dag_ref}")
        found = self._kwargv_ref_from_nodes(dag_obj, txn)
        if found is not None:
            return found
        raise DmlRepoError("DAG has no kwargv node")
+ """ + try: + orphans = self.list_orphans() + stats = defaultdict(int) + # perform deletions in a write transaction + with self._tx(readonly=False) as txn: + for ref in orphans: + try: + if txn.exists(ref): + txn.delete(ref) + stats[ref.ns()] += 1 + except Exception: + logger.warning(f"Failed to delete orphaned object: {ref}", exc_info=True) + return dict(stats) + except Exception as e: + raise DmlRepoError(f"GC failed: {e}") from e + + def list_orphans(self, heads: list[Ref] | None = None) -> list[Ref]: + """Identify orphaned objects (not reachable from provided heads). + + Parameters + ---------- + heads : list[Ref] or None, optional + Traversal roots to start reachability analysis from. If ``None``, + all repository `head` and `index` objects are used. If an empty list + is provided, the underlying database computes orphans across the + entire database. + + Returns + ------- + list[Ref] + A list of references that are not reachable from the provided heads. + + Raises + ------ + DmlRepoError + If the operation fails. 
@dataclass(frozen=True)
class HeadState:
    """Immutable snapshot of what HEAD currently points at."""

    # "attached" when HEAD names a local branch, "detached" when it holds a
    # commit ref directly (the two values produced by get_head_state).
    mode: str
    # Commit that HEAD resolves to.
    commit: Ref
    # Branch name when attached; None when detached.
    branch: str | None = None
def create_index(self, commit_ref: Ref, index_id: str | None = None) -> str:
    """Create an index pointer for `commit_ref`.

    Parameters
    ----------
    commit_ref : Ref
        Commit the new index pointer should reference.
    index_id : str | None
        Explicit index id. When omitted (or empty) a fresh uuid7-based id
        is generated.

    Returns
    -------
    str
        The id of the created index.

    Raises
    ------
    DmlRepoError
        If a generated id collides with an existing index.

    Notes
    -----
    An existing index with a caller-supplied id is replaced. A collision on
    a generated id is an error; its message now reports the colliding id
    (it used to format `index_id`, which is None in that branch).
    """
    generated = not index_id
    _id = str(uuid7()) if generated else index_id
    index_path = self._index_path(_id)
    if index_path.exists():
        if generated:
            raise DmlRepoError(f"Index already exists with id: {_id}")
        # Caller asked for this id explicitly: drop the old pointer so it
        # can be recreated below.
        index_path.unlink()
    self._create_pointer(index_path, commit_ref)
    return _id
payload = self._read_head_payload() + if payload.startswith(_HEAD_ATTACHED_PREFIX): + branch = self._validate_branch_name(payload[len(_HEAD_ATTACHED_PREFIX) :]) + commit = self.get_branch_commit(branch) + return HeadState(mode="attached", branch=branch, commit=commit) + if payload.startswith("commit:"): + commit = Ref(payload) + if commit.ns() != "commit": + raise DmlRepoError(f"Invalid HEAD payload in {self._head_path()}") + return HeadState(mode="detached", branch=None, commit=commit) + raise DmlRepoError(f"Invalid HEAD payload in {self._head_path()}") + + def resolve_head_commit(self) -> Ref: + return self.get_head_state().commit + + def get_attached_head_branch(self) -> str | None: + state = self.get_head_state() + return state.branch + + def require_attached_head_branch(self) -> str: + branch = self.get_attached_head_branch() + if branch is None: + raise DmlRepoError("Current checkout is detached; attach HEAD or pass an explicit branch") + return branch + + def write_attached_head(self, branch_name: str) -> str: + branch = self._validate_branch_name(branch_name) + self._write_head_payload(f"{_HEAD_ATTACHED_PREFIX}{branch}") + return branch + + def write_detached_head(self, commit_ref: Ref) -> Ref: + if commit_ref.ns() != "commit": + raise DmlRepoError(f"Expected commit ref, got: {commit_ref}") + self._write_head_payload(commit_ref.to) + return commit_ref + + def _branch_path(self, branch_name: str) -> Path: + if branch_name.startswith("dml://"): + parsed = parse_revision_uri(branch_name, require_identifier=True) + return self._remote_ref_path(parsed.owner, parsed.project, parsed.branch, parsed.tag) + if "://" in branch_name: + return self._external_tracking_path(branch_name) + return self._local_branch_path(branch_name) + + def _local_branch_path(self, branch_name: str) -> Path: + return self._local_heads_dir() / self._validate_branch_name(branch_name) + + def _index_path(self, index_id: str) -> Path: + return self._local_indexes_dir() / 
self._validate_index_id(index_id) + + def _remote_ref_path(self, owner: str, project: str, branch: str | None, tag: str | None) -> Path: + remote_root = self._refs_root() / "remote" + owner_name = self._validate_segment("owner", owner) + project_name = self._validate_segment("project", project) + if branch is not None: + return remote_root / owner_name / project_name / "heads" / self._validate_branch_name(branch) + if tag is not None: + return remote_root / owner_name / project_name / "tags" / self._validate_branch_name(tag) + raise DmlRepoError("Remote tracking refs require a branch or tag") + + def _external_tracking_path(self, branch_name: str) -> Path: + digest = hashlib.sha256(branch_name.encode("utf-8")).hexdigest() + return self._refs_root() / "remote" / "_external" / "heads" / digest + + def _project_home(self) -> Path: + db_path_value = getattr(self._db, "path", None) + if not db_path_value or not isinstance(db_path_value, (str, os.PathLike)): + raise DmlRepoError("Cannot resolve project home from database path") + db_path = Path(db_path_value).resolve() + if db_path.name == "db" and db_path.parent.name == ".dml": + return db_path.parent.parent + raise DmlRepoError(f"Cannot resolve project home from database path: {db_path}") + + def _refs_root(self) -> Path: + return self._project_home() / ".dml" / "refs" + + def _head_path(self) -> Path: + return self._project_home() / ".dml" / "HEAD" + + def _local_heads_dir(self) -> Path: + return self._refs_root() / "local" / "heads" + + def _local_indexes_dir(self) -> Path: + return self._refs_root() / "local" / "indexes" + + @staticmethod + def _validate_identifier(label: str, value: str) -> str: + if not isinstance(value, str) or not _IDENTIFIER_RE.fullmatch(value): + raise DmlRepoError(f"Invalid {label}: {value!r}") + return value + + @staticmethod + def _validate_segment(label: str, value: str) -> str: + try: + return validate_segment(label, value) + except ValueError as exc: + raise DmlRepoError(str(exc)) from 
exc + + @staticmethod + def _validate_branch_name(value: str) -> str: + try: + return validate_ref_name("branch", value) + except ValueError as exc: + raise DmlRepoError(str(exc)) from exc + + @staticmethod + def _validate_index_id(index_id: str) -> str: + if not isinstance(index_id, str) or not index_id or "\\" in index_id: + raise DmlRepoError(f"Invalid index id: {index_id!r}") + if index_id in {".", ".."}: + raise DmlRepoError(f"Invalid index id: {index_id!r}") + if "/" not in index_id: + return index_id + segments = index_id.split("/") + if len(segments) == 2 and segments[0] == ".cancelled" and segments[1] not in {"", ".", ".."}: + return index_id + raise DmlRepoError(f"Invalid index id: {index_id!r}") + return index_id + + @staticmethod + def _list_ref_names(ref_dir: Path) -> list[str]: + if not ref_dir.exists(): + return [] + return sorted( + str(entry.relative_to(ref_dir)) + for entry in ref_dir.rglob("*") + if entry.is_file() and not entry.name.endswith(".lock") + ) + + def _read_head_payload(self) -> str: + head_path = self._head_path() + if not head_path.exists(): + raise DmlRepoError(f"Pointer does not exist: {head_path}") + payload = head_path.read_text(encoding="utf-8").strip() + if not payload: + raise DmlRepoError(f"Invalid HEAD payload in {head_path}") + return payload + + def _write_head_payload(self, payload: str) -> None: + head_path = self._head_path() + head_path.parent.mkdir(parents=True, exist_ok=True) + with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=head_path.parent, delete=False) as tmp: + tmp.write(payload) + tmp_path = Path(tmp.name) + os.replace(tmp_path, head_path) + + def _create_pointer(self, pointer_path: Path, commit_ref: Ref) -> None: + self._mutate_pointer(pointer_path, expected_old_commit=None, new_commit=commit_ref, create_only=True) + + def _delete_pointer(self, pointer_path: Path) -> None: + with self._pointer_lock(pointer_path): + if not pointer_path.exists(): + raise DmlRepoError(f"Pointer does not exist: 
{pointer_path}") + pointer_path.unlink() + + def _get_pointer_commit(self, pointer_path: Path) -> Ref: + return self._read_pointer_commit(pointer_path) + + def _update_pointer_commit(self, pointer_path: Path, old_commit: Ref, new_commit: Ref) -> Ref: + if old_commit.ns() != "commit": + raise DmlRepoError(f"Expected commit ref, got: {old_commit}") + if new_commit.ns() != "commit": + raise DmlRepoError(f"Expected commit ref, got: {new_commit}") + self._mutate_pointer(pointer_path, expected_old_commit=old_commit, new_commit=new_commit) + return new_commit + + def _read_pointer_commit(self, pointer_path: Path) -> Ref: + if not pointer_path.exists(): + raise DmlRepoError(f"Pointer does not exist: {pointer_path}") + commit_id = pointer_path.read_text(encoding="utf-8").strip() + if not commit_id or "/" in commit_id or "\\" in commit_id or ":" in commit_id: + raise DmlRepoError(f"Invalid pointer payload in {pointer_path}") + return Ref(f"commit:{commit_id}") + + def _mutate_pointer( + self, + pointer_path: Path, + *, + expected_old_commit: Ref | None, + new_commit: Ref, + create_only: bool = False, + ) -> None: + with self._pointer_lock(pointer_path): + if create_only: + if pointer_path.exists(): + raise DmlRepoError(f"Branch already exists: {pointer_path.name}") + else: + current_commit = self._read_pointer_commit(pointer_path) + if expected_old_commit is not None and current_commit != expected_old_commit: + msg = f"Stale pointer update rejected for {pointer_path.name}" + raise DmlPointerConflictError(msg, current_commit=current_commit) + self._write_pointer_commit(pointer_path, new_commit) + + def _write_pointer_commit(self, pointer_path: Path, commit_ref: Ref) -> None: + if commit_ref.ns() != "commit": + raise DmlRepoError(f"Expected commit ref, got: {commit_ref}") + pointer_path.parent.mkdir(parents=True, exist_ok=True) + with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=pointer_path.parent, delete=False) as tmp: + tmp.write(commit_ref.id()) + tmp_path = 
Path(tmp.name) + os.replace(tmp_path, pointer_path) + + def _pointer_lock(self, pointer_path: Path): + pointer_path.parent.mkdir(parents=True, exist_ok=True) + lock_path = pointer_path.parent / f"{pointer_path.name}.lock" + return _FileLock(lock_path) + + @staticmethod + def _require_commit(commit_ref: Ref, txn) -> None: + if commit_ref.ns() != "commit": + raise DmlRepoError(f"Expected commit ref, got: {commit_ref}") + if not txn.exists(commit_ref): + raise DmlRepoError(f"Commit does not exist: {commit_ref}") + + +class _FileLock: + def __init__(self, path: Path): + self._path = path + self._fh = None + + def __enter__(self): + import fcntl + + self._fh = self._path.open("a+", encoding="utf-8") + fcntl.flock(self._fh.fileno(), fcntl.LOCK_EX) + return self + + def __exit__(self, exc_type, exc, tb): + import fcntl + + assert self._fh is not None + fcntl.flock(self._fh.fileno(), fcntl.LOCK_UN) + self._fh.close() + self._fh = None diff --git a/src/daggerml/_internal/ops/index.py b/src/daggerml/_internal/ops/index.py new file mode 100644 index 0000000..f0a679f --- /dev/null +++ b/src/daggerml/_internal/ops/index.py @@ -0,0 +1,1299 @@ +"""Index operations for managing working state and function execution. 
+ +Public API: + IndexOps - Class for index and execution operations +""" + +from __future__ import annotations + +import json +import os +import shutil +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from subprocess import run +from typing import Any, Mapping, Optional, cast +from urllib.parse import urlparse + +from daggerml._internal._db import Ref +from daggerml._internal.builtins import BUILTIN_FNS +from daggerml._internal.exec_state import ExecutionRecord, ExecutionState, LaunchState +from daggerml._internal.execution_context import get_current_execution_context +from daggerml._internal.ops.base_ops import BaseOps, with_retry +from daggerml._internal.ops.cache import CacheOps +from daggerml._internal.ops.dag import DagOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.node import NodeOps +from daggerml._internal.ops.remote import RemoteOps +from daggerml._internal.types import ( + ArgvNode, + Commit, + Dag, + Datum, + DictDatum, + DmlPointerConflictError, + DmlRepoError, + Error, + FnNode, + ImportNode, + KwargvNode, + ListDatum, + LiteralNode, + Node, + Runnable, + RunnableDatum, + ScalarDatum, + Tree, + Uri, + require_ref, +) +from daggerml._internal.util import now, unnest, uuid7 + + +@dataclass(frozen=True) +class _PreparedAdapterCall: + argv_ref: Ref + adapter_path: str + cache_key: str + runnable: dict[str, Any] + caller_execution_id: str | None = None + caller_cache_key: str | None = None + + +@dataclass +class IndexOps(BaseOps): + remote_root: str + + def _remote_ops(self): + parsed = urlparse(self.remote_root) + prefix = parsed.path.strip("/") + return RemoteOps(_db=self._db, bucket=parsed.netloc, prefix=f"{prefix}/dml" if prefix else "dml") + + @staticmethod + def _kwargv_ref_from_nodes(dag: Dag, txn) -> Ref | None: + matches = [] + for node_ref in dag.nodes: + node = txn.get(node_ref) + if isinstance(node, KwargvNode): + matches.append(node_ref) + if len(matches) > 1: + raise 
DmlRepoError("DAG has multiple kwargv nodes") + return matches[0] if matches else None + + def start_fn( + self, + index_id: str, + argv: list[Ref], + kwargv: Optional[dict[str, Ref]] = None, + name: Optional[str] = None, + ) -> Optional[Ref]: + kwargv = kwargv or {} + # Important: if the called function produced a DaggerML Error, we still + # want any DB + pointer updates performed while finishing the call to be + # committed. We therefore capture the error inside the transaction and + # raise it only after the txn scope exits successfully. + with self._tx(readonly=False) as txn: + argv_ref = self._prepare_fn(index_id, argv, kwargv, txn) + dag_ref = self._run_builtin(argv_ref, txn) + if dag_ref is not None: + resolved_dag_ref = dag_ref + else: + cops = CacheOps(_db=self._db, remote_root=self.remote_root) + dag_ref = cops._get(argv_ref, txn) + if dag_ref is not None: + resolved_dag_ref = dag_ref + else: + prepared = self._prepare_adapter_call(index_id, argv_ref, txn) + + if "resolved_dag_ref" in locals(): + return self._finish_fn_result(resolved_dag_ref, argv, name, None, index_id) + argv_ptr = self._remote_ops().put_ref_manifest(prepared.argv_ref) + es = ExecutionState(prepared.cache_key, remote_root=self.remote_root) + + # Step 1: try to acquire the mutex + if not es.lock(): + # Another process is driving this cycle + return None + locked = True + + try: + # Step 2: post-lock cache check + with self._tx(readonly=False) as txn: + cops = CacheOps(_db=self._db, remote_root=self.remote_root) + dag_ref = cops._get(prepared.argv_ref, txn) + if dag_ref is not None: + es.unlock() + locked = False + post_lock_dag_ref = dag_ref + if "post_lock_dag_ref" in locals(): + return self._finish_fn_result(post_lock_dag_ref, argv, name, None, index_id) + + execution_id = es.read_active_execution_id() + execution_record = None + launch_state = None + if execution_id is not None: + execution_record = es.read_execution_record(execution_id) + launch_state = 
es.read_launch_state(execution_id) + if ( + execution_record is None + or launch_state is None + or execution_record["lifecycle"] in {"succeeded", "failed", "cancel-detached"} + ): + es.delete_active_execution() + execution_id = None + execution_record = None + launch_state = None + if execution_id is None: + assert execution_record is None + execution_id = str(uuid7()) + state = None + else: + assert execution_record is not None + assert launch_state is not None + state = cast(dict[str, Any] | None, launch_state["resume_state"]) + + # Step 3: call adapter (holding the lock) + result = self._call_adapter( + prepared, + argv_ptr, + execution_id=execution_id, + state=state, + execution_status=(cast(str, execution_record["lifecycle"]) if execution_record is not None else None), + cancel_requested_by=( + cast(str | None, execution_record["cancellation_requested_by"]) + if execution_record is not None + else None + ), + ) + + self._record_call_edges(prepared, es, execution_id=execution_id) + + # Step 4: handle adapter result + status = result["status"] + execution_state: ExecutionRecord = { + "execution_id": execution_id, + "cache_key": prepared.cache_key, + "lifecycle": status, + "updated_at": int(time.time()), + "spawned_execution_ids": ( + list(cast(list[str], execution_record.get("spawned_execution_ids", []))) + if execution_record is not None + else [] + ), + "cancellation_requested_by": ( + execution_record["cancellation_requested_by"] if execution_record is not None else None + ), + } + execution_state = es.update_execution_record(execution_state) + if status == "succeeded": + es.delete_active_execution() + es.unlock() + locked = False + with self._tx(readonly=False) as txn: + cops = CacheOps(_db=self._db, remote_root=self.remote_root) + dag_ref = cops._get(prepared.argv_ref, txn) + if dag_ref is not None: + terminal_dag_ref = dag_ref + if "terminal_dag_ref" in locals(): + return self._finish_fn_result(terminal_dag_ref, argv, name, None, index_id) + raise 
DmlRepoError("Adapter reported success but no cached DAG was published") + elif status == "failed": + self._publish_terminal_state(prepared.argv_ref, result, execution_id=execution_id) + es.delete_active_execution() + es.unlock() + locked = False + with self._tx(readonly=False) as txn: + cops = CacheOps(_db=self._db, remote_root=self.remote_root) + dag_ref = cops._get(prepared.argv_ref, txn) + if dag_ref is not None: + terminal_dag_ref = dag_ref + if "terminal_dag_ref" in locals(): + return self._finish_fn_result(terminal_dag_ref, argv, name, None, index_id) + raise DmlRepoError("Adapter reported failure but no cached failed DAG was published") + elif status == "cancel-detached": + es.delete_active_execution() + es.unlock() + locked = False + return None + else: + if state is None: + launch_state_record: LaunchState = { + "execution_id": execution_id, + "cache_key": prepared.cache_key, + "resume_state": cast(dict[str, Any], result["state"]), + "created_at": int(time.time()), + } + es.update_launch_state(launch_state_record) + if not es.create_active_execution(execution_id): + raise DmlRepoError(f"Active execution already exists for cache key: {prepared.cache_key}") + # running — adapter is still working asynchronously + es.unlock() + locked = False + return None + finally: + if locked: + es.unlock() + + def delete(self, index_id: str) -> None: + """Delete an index object from db.""" + HeadOps(_db=self._db).delete_index(index_id) + + def cancel(self, index_id: str, *, requested_by: str) -> dict[str, Any]: + cancelled_path = self._resolve_index_for_cancellation(index_id) + root_record = self._mark_index_root_cancel_requested(index_id, requested_by=requested_by) + self._drop_live_caller_edges(index_id, cast(list[str], root_record.get("spawned_execution_ids", []))) + graph = self._collect_cancellation_graph(index_id, root_record) + candidate_set = {callee_id for _caller_id, callee_id in graph} + own_executions = set(candidate_set) + return { + "index_id": index_id, + 
"requested_by": requested_by, + "cancelled_path": cancelled_path, + "graph": graph, + "candidate_set": candidate_set, + "own_executions": own_executions, + } + + @with_retry + def create( + self, + head: Optional[str] = None, + commit: Optional[Ref] = None, + argv_ptr: Optional[str] = None, + index_id: Optional[str] = None, + ) -> str: + """Create a new index object. + + Parameters + ---------- + head : str, optional + Branch name to base the index on. + commit : Ref, optional + Commit reference to base the index on. + argv_ptr : str, optional + Optional remote manifest OID to initialize argv from. + index_id : str, optional + Optional index id to use (defaults to a new UUID). + + Returns + ------- + str + Opaque index id for the newly created index pointer. + """ + modes = [head is not None, commit is not None, argv_ptr is not None] + if sum(modes) != 1: + raise DmlRepoError("Provide exactly one of branch, commit, or argv_ptr.") + kw = {} + if head is not None: + kw["commit"] = HeadOps(_db=self._db).get_branch_commit(head) + elif commit is not None: + kw["commit"] = commit + if argv_ptr is not None: + kw["argv"] = self._remote_ops().load_ptr(argv_ptr, expected_root_ns="node-argv") + with self._tx(readonly=False) as txn: + commit_ref = self._create(**kw, txn=txn) + created_index_id = HeadOps(_db=self._db).create_index(commit_ref, index_id=index_id) + if self.remote_root: + self._ensure_index_execution_root(created_index_id) + return created_index_id + + @with_retry + def get_kwargv(self, index_id: str) -> Ref: + """Return the kwargv node for an index (raises if missing).""" + commit_ref = HeadOps(_db=self._db).get_index_commit(index_id) + with self._tx(readonly=True) as txn: + ctx = txn.get_commit_ctx(commit_ref) + return DagOps(_db=self._db).get_kwargv(cast(Ref, ctx.commit.dag)) + + @with_retry + def get_argv(self, index_id: str) -> Ref: + """Return the argv node for an index (raises if missing).""" + commit_ref = HeadOps(_db=self._db).get_index_commit(index_id) + 
@with_retry
def describe(self, index_id: str) -> dict[str, Any]:
    """Summarise the commit and DAG an index currently points at.

    Returns
    -------
    dict
        Keys ``id``, ``commit``, ``dag``, ``nodes``, ``names``, ``result``,
        ``argv`` and ``kwargv``. When the commit context has no DAG the
        DAG-derived entries are empty (``nodes``/``names``) or ``None``.
    """
    commit_ref = HeadOps(_db=self._db).get_index_commit(index_id)
    with self._tx(readonly=True) as txn:
        ctx = txn.get_commit_ctx(commit_ref)
        summary: dict[str, Any] = {
            "id": index_id,
            "commit": commit_ref,
            "dag": ctx.commit.dag,
        }
        dag = ctx.dag
        if dag is None:
            summary.update(nodes=[], names={}, result=None, argv=None, kwargv=None)
        else:
            summary.update(
                nodes=list(dag.nodes),
                names=dict(dag.names),
                result=dag.result,
                argv=dag.argv,
                kwargv=self._kwargv_ref_from_nodes(dag, txn),
            )
        return summary
put_import(self, index_id: str, dag: Ref, node: Optional[Ref] = None, name: Optional[str] = None) -> Ref: + """Import a node from another DAG into the current index DAG.""" + + def _build(old_commit: Ref, txn): + ctx = txn.get_commit_ctx(old_commit) + dag_obj: Dag = txn.get(dag) + imported_node = node if node is not None else dag_obj.result + if imported_node is None: + raise DmlRepoError("Cannot import from a DAG with no result node") + if dag == ctx.commit.dag: + raise DmlRepoError("Cannot import from the current DAG") + node_obj = ImportNode(dag, imported_node) + return self._put_node_retry(node_obj, name, old_commit, txn) + + return self._retry_index_publication(index_id, _build) + + @with_retry + def put_literal(self, index_id: str, value: Any, name: Optional[str] = None) -> Ref: + return self._retry_index_publication( + index_id, + lambda old_commit, txn: self._put_literal_retry( + value, + name=name, + txn=txn, + index_id=index_id, + old_commit=old_commit, + ), + ) + + @with_retry + def commit( + self, + index_id: str, + value: Ref | Error, + head: Optional[str] = None, + message: Optional[str] = None, + dag_name: Optional[str] = None, + ) -> Ref: + """Commit the current index state with the given value as the result node. + + Returns + ------- + Ref + Reference to the newly created commit. + + Raises + ------ + DmlRepoError + If the commit operation fails. 
+ """ + head_ops = HeadOps(_db=self._db) + branch_name = head + old_commit = head_ops.get_index_commit(index_id) + branch_commit = head_ops.get_branch_commit(branch_name) if branch_name is not None else None + while True: + with self._tx(readonly=False) as txn: + ctx = txn.get_commit_ctx(old_commit) + if ctx.dag is None: + raise DmlRepoError("Index commit has no DAG.") + if isinstance(value, Error): + ctx.dag.error = txn.put(value) + else: + if value not in ctx.dag.nodes: + raise DmlRepoError("Value node is not part of DAG.") + ctx.dag.result = value + ctx.commit.dag = txn.put(ctx.dag) + if dag_name is not None: + ctx.tree.dags[dag_name] = ctx.commit.dag + ctx.commit.tree = txn.put(ctx.tree) + if message is not None: + ctx.commit.message = message + if branch_commit is not None: + ctx.commit.parents = [branch_commit] + ctx.commit.modified = now() + commit_ref = txn.put(ctx.commit) + if branch_name is None: + break + try: + assert branch_commit is not None + head_ops.update_branch_commit(branch_name, branch_commit, commit_ref) + break + except DmlPointerConflictError as err: + branch_commit = err.current_commit + head_ops.delete_index(index_id) + if ctx.dag.argv is not None: + # automatically cache the DAG if it has an argv (i.e. 
def resolve_dag_node(self, index_id: str, dag_name: str, node_name: Optional[str] = None) -> tuple[Ref, Ref]:
    """Look up a named DAG in the index's tree and one node inside it.

    Parameters
    ----------
    index_id : str
        Index whose commit tree is searched.
    dag_name : str
        Name of the DAG in the tree.
    node_name : str, optional
        Name of the node to return. When omitted the DAG's result node is
        returned.

    Returns
    -------
    tuple[Ref, Ref]
        ``(dag_ref, node_ref)``.

    Raises
    ------
    DmlRepoError
        If the DAG is not in the tree, the DAG has no result node (when
        ``node_name`` is omitted), or the named node is not in the DAG.
    """
    commit_ref = HeadOps(_db=self._db).get_index_commit(index_id)
    with self._tx(readonly=True) as txn:
        ctx = txn.get_commit_ctx(commit_ref)
        tree: Tree = txn.get(ctx.commit.tree)
        dag_ref = tree.dags.get(dag_name)
        if dag_ref is None:
            raise DmlRepoError(f"DAG '{dag_name}' not found")
        dag: Dag = txn.get(dag_ref)
        # Pick the candidate node and the message to use if it is absent.
        if node_name is None:
            target = dag.result
            missing = f"DAG '{dag_name}' has no result node"
        else:
            target = dag.names.get(node_name)
            missing = f"Node '{node_name}' not found in DAG '{dag_name}'"
        if target is None:
            raise DmlRepoError(missing)
        return dag_ref, target
self._retry_index_publication( + index_id, + lambda old_commit, retry_txn: self._put_node_retry( + node, + name, + old_commit, + retry_txn, + ), + ) + + def _create( + self, + *, + commit: Optional[Ref] = None, + author: Optional[str] = None, + argv: Optional[Ref] = None, # -> ArgvNode + txn, + ) -> Ref: + nodes: list[Ref] = [] + kw: dict[str, Any] = {"author": author or "DaggerML User"} + if commit is not None: + if argv is not None: + raise DmlRepoError("Cannot provide both commit and argv.") + base_ctx = txn.get_commit_ctx(commit) + kw.update({"parents": [commit], "tree": base_ctx.commit.tree}) + elif argv is not None: + argv_obj: ArgvNode = txn.get(argv) + if not isinstance(argv_obj, ArgvNode): + raise DmlRepoError("Argv node required") + nodes.append(argv) + nodes.append(self._kwargv_from_argv(argv, txn)) + kw.update({"parents": [], "tree": txn.put(Tree(dags={}))}) + else: + raise DmlRepoError("Either commit or argv must be provided.") + dag_ref = txn.put(Dag(nodes=nodes, names={}, result=None, argv=argv)) + return txn.put(Commit(message="", dag=dag_ref, **kw)) + + # ~~~~~~~~~~~ START_FN ~~~~~~~~~~~ + def _runnable_chain(self, runnable_ref: Ref, txn) -> list[tuple[Ref, RunnableDatum]]: + chain: list[tuple[Ref, RunnableDatum]] = [] + seen: set[Ref] = set() + current = runnable_ref + while True: + if current in seen: + raise DmlRepoError("Runnable sub cycle detected") + seen.add(current) + runnable: RunnableDatum = txn.get(current) + if not isinstance(runnable, RunnableDatum): + raise DmlRepoError("First arg must resolve to a Runnable datum") + chain.append((current, runnable)) + if runnable.sub is None: + break + current = runnable.sub + return chain + + def _innermost_runnable(self, runnable_ref: Ref, txn) -> RunnableDatum: + return self._runnable_chain(runnable_ref, txn)[-1][1] + + def _kwargv_from_argv(self, argv_ref: Ref, txn) -> Ref: + argv_node: ArgvNode = txn.get(argv_ref) + argv_datum: ListDatum = txn.get(argv_node.value) + if len(argv_datum.data) == 0: 
+ raise DmlRepoError("argv is empty") + runnable_ref = argv_datum.data[0] + if runnable_ref.ns() != "datum-runnable": + raise DmlRepoError("First arg must resolve to a Runnable datum") + runnable = self._innermost_runnable(runnable_ref, txn) + return txn.put(KwargvNode(value=runnable.kwargs)) + + def _resolve_runnable_kwargs(self, runnable_ref: Ref, kwargv: dict[str, Ref], index_id: str, txn) -> Ref: + ctx = txn.get_commit_ctx(HeadOps(_db=self._db).get_index_commit(index_id)) + if ctx.dag is None: + raise DmlRepoError("Index commit has no DAG.") + chain = self._runnable_chain(runnable_ref, txn) + resolved: dict[Ref, dict[str, Ref]] = {} + for ref, runnable in chain: + kwargs_datum: DictDatum = txn.get(runnable.kwargs) + resolved[ref] = dict(kwargs_datum.data) + + for key, value in kwargv.items(): + require_ref(value, ["node"], "start_fn kwargv values") + if value not in ctx.dag.nodes: + raise DmlRepoError("kwargv nodes must be part of the current DAG.") + value_ref = self._resolve_node_value_ref(value, txn) + assigned = False + for ref, _runnable in reversed(chain): + if key in resolved[ref]: + resolved[ref][key] = value_ref + assigned = True + break + if not assigned: + raise DmlRepoError(f"Unknown kwarg: {key}") + + sub_ref: Optional[Ref] = None + for ref, runnable in reversed(chain): + kwargs_ref = txn.put(DictDatum(data=resolved[ref])) + sub_ref = txn.put( + RunnableDatum( + target=runnable.target, + sub=sub_ref, + kwargs=kwargs_ref, + adapter=runnable.adapter, + ) + ) + assert sub_ref is not None + return sub_ref + + def _prepare_fn( + self, + index_id: str, + argv: list[Ref], + kwargv: dict[str, Ref], + txn, + ctx=None, + ) -> Ref: + if len(argv) == 0: + raise DmlRepoError("argv is empty") + [require_ref(arg, ["node"], "start_fn argv elements") for arg in argv] + if ctx is None: + ctx = txn.get_commit_ctx(HeadOps(_db=self._db).get_index_commit(index_id)) + if ctx.dag is None: + raise DmlRepoError("Index commit has no DAG.") + if not 
set(argv).issubset(set(ctx.dag.nodes)): + raise DmlRepoError("All argv nodes must be part of current DAG.") + fn_datum_ref = self._resolve_node_value_ref(argv[0], txn) + if fn_datum_ref.nss()[0] != "datum": + raise DmlRepoError("First arg must resolve to a Datum.") + fn_datum: Datum = txn.get(fn_datum_ref) + if not isinstance(fn_datum, RunnableDatum): + raise DmlRepoError("First arg must resolve to a Runnable datum") + runnable_ref = self._resolve_runnable_kwargs(fn_datum_ref, kwargv, index_id, txn) + argv_ref = txn.put(ListDatum([runnable_ref, *[self._resolve_node_value_ref(arg, txn) for arg in argv[1:]]])) + return txn.put(ArgvNode(value=argv_ref)) + + def _run_builtin(self, argv_ref: Ref, txn) -> Optional[Ref]: + argv_node: ArgvNode = txn.get(argv_ref) + argv_datum: ListDatum = txn.get(argv_node.datum_ref(txn)) + if len(argv_datum.data) == 0: + raise DmlRepoError("argv is empty") + fn_runnable_ref = argv_datum.data[0] + if fn_runnable_ref.ns() != "datum-runnable": + raise DmlRepoError("First arg must resolve to a Runnable datum") + fn_runnable = self._innermost_runnable(fn_runnable_ref, txn) + if fn_runnable.adapter != "": + return None + fn_uri_obj: Uri = txn.get(fn_runnable.target) + if not isinstance(fn_uri_obj, Uri): + raise DmlRepoError("Runnable target must resolve to a Uri datum.") + fn_uri = fn_uri_obj.uri + fn_parsed = urlparse(fn_uri) + if fn_parsed.scheme != "daggerml": + raise DmlRepoError(f"Invalid builtin URI scheme: {fn_parsed.scheme}") + fpath = fn_parsed.path.lstrip("/") + if fpath not in BUILTIN_FNS: + raise DmlRepoError(f"Unknown builtin: {fn_parsed} -- path: {fpath}") + kwargv_datum: DictDatum = txn.get(fn_runnable.kwargs) + if kwargv_datum.data != {}: + raise DmlRepoError("Keyword arguments are not supported for builtin functions.") + node_ops = NodeOps(_db=self._db) + args = [node_ops._unroll_datum_ref(arg, txn) for arg in argv_datum.data[1:]] + result = BUILTIN_FNS[fpath](*args) + return self._build_scratch_dag_in_txn(argv_ref, txn, 
result=result) + + def _runnable_envelope(self, runnable_ref: Ref, txn, node_ops: NodeOps) -> dict[str, Any]: + runnable: RunnableDatum = txn.get(runnable_ref) + target: Uri = txn.get(runnable.target) + if not isinstance(target, Uri): + raise DmlRepoError("Runnable target must resolve to a Uri datum.") + kwargs_datum: DictDatum = txn.get(runnable.kwargs) + sub = None + if runnable.sub is not None: + sub = self._runnable_envelope(runnable.sub, txn, node_ops) + return { + "target": target.uri, + "kwargs": {k: node_ops._unroll_datum_ref(v, txn) for k, v in kwargs_datum.data.items()}, + "adapter": runnable.adapter, + "sub": sub, + } + + @staticmethod + def _validate_adapter_output(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise DmlRepoError("Adapter output schema invalid") + status = payload.get("status") + if status not in {"running", "succeeded", "failed", "cancel-detached"}: + raise DmlRepoError("Adapter output schema invalid") + if status == "succeeded": + allowed = {"status", "error", "dag_id"} + if not set(payload.keys()).issubset(allowed): + raise DmlRepoError("Adapter output schema invalid") + dag_id = payload.get("dag_id") + if not isinstance(dag_id, str) or not dag_id: + raise DmlRepoError("Adapter output schema invalid: succeeded requires dag_id") + if payload.get("error") is not None: + raise DmlRepoError("Adapter output schema invalid") + elif status == "failed": + if set(payload.keys()) != {"status", "error"}: + raise DmlRepoError("Adapter output schema invalid") + if payload.get("error") is None: + raise DmlRepoError("Adapter output schema invalid") + elif status == "cancel-detached": + if set(payload.keys()) != {"status", "error"}: + raise DmlRepoError("Adapter output schema invalid") + if payload.get("error") is not None: + raise DmlRepoError("Adapter output schema invalid") + else: + if set(payload.keys()) != {"status", "error", "state"}: + raise DmlRepoError("Adapter output schema invalid") + if payload.get("error") is 
not None: + raise DmlRepoError("Adapter output schema invalid") + if not isinstance(payload.get("state"), dict): + raise DmlRepoError("Adapter output schema invalid: running requires state") + return payload + + def _caller_identity(self, index_id: str, txn) -> tuple[str | None, str | None]: + caller_execution_id, caller_cache_key = get_current_execution_context() + if caller_execution_id: + return caller_execution_id, caller_cache_key + ctx = txn.get_commit_ctx(HeadOps(_db=self._db).get_index_commit(index_id)) + if ctx.dag is None: + raise DmlRepoError("Index commit has no DAG.") + return index_id, index_id + + def _prepare_adapter_call( + self, + index_id: str, + argv_ref: Ref, + txn, + *, + caller_execution_id: str | None = None, + caller_cache_key: str | None = None, + ) -> _PreparedAdapterCall: + argv_datum: ListDatum = txn.get(txn.get(argv_ref).datum_ref(txn)) + if len(argv_datum.data) == 0: + raise DmlRepoError("argv is empty") + fn_runnable_ref = argv_datum.data[0] + fn_runnable: RunnableDatum = txn.get(fn_runnable_ref) + if not isinstance(fn_runnable, RunnableDatum): + raise DmlRepoError("First arg must resolve to a Runnable datum") + adapter_path = shutil.which(fn_runnable.adapter) + if not adapter_path: + raise DmlRepoError(f"No such adapter: {fn_runnable.adapter}") + node_ops = NodeOps(_db=self._db) + if caller_execution_id is None and caller_cache_key is None: + caller_execution_id, caller_cache_key = self._caller_identity(index_id=index_id, txn=txn) + return _PreparedAdapterCall( + argv_ref=argv_ref, + adapter_path=adapter_path, + cache_key=argv_ref.id(), + runnable=self._runnable_envelope(fn_runnable_ref, txn, node_ops), + caller_execution_id=caller_execution_id, + caller_cache_key=caller_cache_key, + ) + + def _load_remote_dag(self, dag_id: str) -> Ref: + remote_ops = self._remote_ops() + dag_ref = remote_ops._decode_ref(remote_ops._remote_get_dag_ref(dag_id)) + return remote_ops.load_ptr(dag_ref["target"], expected_root_ns="dag") + + def 
_build_failed_execution_dag(self, argv_ref: Ref, error_message: str) -> Ref: + return self._build_scratch_dag(argv_ref, error=Error.from_ex(DmlRepoError(error_message))) + + def _publish_terminal_state(self, argv_ref: Ref, state: Mapping[str, Any], *, execution_id: str) -> None: + cops = CacheOps(_db=self._db, remote_root=self.remote_root) + if state["status"] == "succeeded": + dag_id = state.get("dag_id") + if not isinstance(dag_id, str) or not dag_id: + raise DmlRepoError("Execution state succeeded but dag_id is missing") + dag_ref = self._load_remote_dag(dag_id) + elif state["status"] == "failed": + dag_ref = self._build_failed_execution_dag(argv_ref, state.get("error") or "Adapter reported failure") + else: + raise DmlRepoError(f"Cannot publish non-terminal execution state: {state['status']}") + cops.put(dag_ref, execution_id=execution_id) + + @staticmethod + def _record_call_edges(prepared: _PreparedAdapterCall, state: ExecutionState, *, execution_id: str) -> None: + if prepared.caller_execution_id is None: + return + state.record_execution_dependency( + caller_execution_id=prepared.caller_execution_id, + callee_execution_id=execution_id, + ) + if prepared.caller_cache_key: + state.update_execution_record( + { + "execution_id": prepared.caller_execution_id, + "cache_key": prepared.caller_cache_key, + "lifecycle": "running", + "updated_at": int(time.time()), + "spawned_execution_ids": [execution_id], + "cancellation_requested_by": None, + } + ) + + def _ensure_index_execution_root(self, index_id: str) -> ExecutionRecord: + timestamp = int(time.time()) + return ExecutionState(index_id, remote_root=self.remote_root).update_execution_record( + { + "execution_id": index_id, + "cache_key": index_id, + "lifecycle": "running", + "updated_at": timestamp, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + ) + + def _mark_index_root_cancel_requested(self, index_id: str, *, requested_by: str) -> ExecutionRecord: + record = 
self._ensure_index_execution_root(index_id) + return ExecutionState(index_id, remote_root=self.remote_root).update_execution_record( + { + "execution_id": index_id, + "cache_key": cast(str, record["cache_key"]), + "lifecycle": "cancel-pending", + "updated_at": int(time.time()), + "spawned_execution_ids": list(cast(list[str], record.get("spawned_execution_ids", []))), + "cancellation_requested_by": requested_by, + } + ) + + def _resolve_index_for_cancellation(self, index_id: str) -> Path: + head_ops = HeadOps(_db=self._db) + live_path = head_ops._index_path(index_id) + cancelled_path = self._cancelled_index_path(head_ops, index_id) + with head_ops._pointer_lock(live_path): + if cancelled_path.exists(): + return cancelled_path + if not live_path.exists(): + raise DmlRepoError(f"Pointer does not exist: {live_path}") + cancelled_path.parent.mkdir(parents=True, exist_ok=True) + os.replace(live_path, cancelled_path) + return cancelled_path + + def _freeze_index_for_cancellation(self, index_id: str): + return self._resolve_index_for_cancellation(index_id) + + @staticmethod + def _cancelled_index_path(head_ops: HeadOps, index_id: str): + return head_ops._local_indexes_dir() / ".cancelled" / head_ops._validate_index_id(index_id) + + def _collect_cancellation_graph(self, index_id: str, root_record: ExecutionRecord) -> set[tuple[str, str]]: + graph: set[tuple[str, str]] = set() + pending = [ + (index_id, dependency_id) for dependency_id in cast(list[str], root_record.get("spawned_execution_ids", [])) + ] + seen: set[str] = set() + while pending: + caller_id, candidate_id = pending.pop() + graph.add((caller_id, candidate_id)) + if candidate_id in seen: + continue + seen.add(candidate_id) + record = ExecutionState(candidate_id, remote_root=self.remote_root).read_execution_record(candidate_id) + if record is None: + continue + pending.extend( + (candidate_id, dependency_id) + for dependency_id in cast(list[str], record.get("spawned_execution_ids", [])) + ) + return graph + + def 
_drop_live_caller_edges(self, caller_execution_id: str, callee_execution_ids: list[str]) -> None: + for callee_execution_id in callee_execution_ids: + ExecutionState(callee_execution_id, remote_root=self.remote_root).delete_execution_dependency( + caller_execution_id=caller_execution_id, + callee_execution_id=callee_execution_id, + ) + + def _cancel_execution_candidate( + self, + candidate_id: str, + *, + requested_by: str, + own_executions: set[str], + ) -> dict[str, Any]: + record_state = ExecutionState(candidate_id, remote_root=self.remote_root) + resolved_record = record_state.read_execution_record(candidate_id) + if resolved_record is None: + return { + "execution_id": candidate_id, + "outcome": -1, + "lock_retry": False, + "cancel_requested": False, + } + lock_state = ExecutionState(cast(str, resolved_record["cache_key"]), remote_root=self.remote_root) + if not lock_state.lock(): + return { + "execution_id": candidate_id, + "outcome": None, + "lock_retry": True, + "cancel_requested": False, + } + try: + record = lock_state.read_execution_record(candidate_id) + if record is None: + return { + "execution_id": candidate_id, + "outcome": -1, + "lock_retry": False, + "cancel_requested": False, + } + if record["lifecycle"] == "cancel-detached": + return { + "execution_id": candidate_id, + "cancel_requested": False, + "lock_retry": False, + "outcome": 1, + } + if record["lifecycle"] in {"succeeded", "failed"}: + return { + "execution_id": candidate_id, + "cancel_requested": False, + "lock_retry": False, + "outcome": -1, + } + inactive = {"cancel-pending", "cancel-detached", "succeeded", "failed"} + active_callers = self._active_callers(candidate_id, inactive) + if active_callers - own_executions: + return { + "execution_id": candidate_id, + "cancel_requested": False, + "lock_retry": False, + "outcome": -1, + } + lock_state.delete_active_execution() + pending_record = lock_state.update_execution_record( + { + "execution_id": candidate_id, + "cache_key": cast(str, 
record["cache_key"]), + "lifecycle": "cancel-pending", + "updated_at": int(time.time()), + "spawned_execution_ids": list(cast(list[str], record.get("spawned_execution_ids", []))), + "cancellation_requested_by": requested_by, + } + ) + finally: + lock_state.unlock() + + result = self._invoke_cancel_update(candidate_id, pending_record) + if result["status"] == "cancel-detached": + record_state.update_execution_record( + {**pending_record, "lifecycle": "cancel-detached", "updated_at": int(time.time())} + ) + self._drop_live_caller_edges( + candidate_id, + cast(list[str], pending_record.get("spawned_execution_ids", [])), + ) + record_state.create_cancellation_tombstone( + execution_id=candidate_id, + cache_key=cast(str, pending_record["cache_key"]), + requested_by=requested_by, + requested_at=int(time.time()), + ) + return { + "execution_id": candidate_id, + "cancel_requested": True, + "lock_retry": False, + "outcome": 1, + } + return { + "execution_id": candidate_id, + "cancel_requested": True, + "lock_retry": False, + "outcome": None, + } + + def _active_callers(self, execution_id: str, inactive: set[str]) -> set[str]: + callers: set[str] = set() + state = ExecutionState(execution_id, remote_root=self.remote_root) + for caller_id in state.list_execution_callers(execution_id): + caller_record = ExecutionState(caller_id, remote_root=self.remote_root).read_execution_record(caller_id) + if caller_record is not None and caller_record["lifecycle"] not in inactive: + callers.add(caller_id) + return callers + + def _owned_executions_cancelled(self, own_executions: set[str]) -> bool: + for execution_id in own_executions: + record = ExecutionState(execution_id, remote_root=self.remote_root).read_execution_record(execution_id) + if record is None or record["lifecycle"] != "cancel-detached": + return False + return True + + def _complete_index_cancellation(self, index_id: str, *, cancelled_path: Path, own_executions: set[str]) -> None: + if not 
self._owned_executions_cancelled(own_executions): + raise DmlRepoError(f"Cancellation incomplete for index: {index_id}") + root_state = ExecutionState(index_id, remote_root=self.remote_root) + root_record = root_state.read_execution_record(index_id) + if root_record is None: + raise DmlRepoError(f"Missing synthetic root execution record: {index_id}") + root_state.update_execution_record( + {**root_record, "lifecycle": "cancel-detached", "updated_at": int(time.time())} + ) + try: + cancelled_path.unlink() + except FileNotFoundError: + pass + + @staticmethod + def _is_synthetic_index_root(record: ExecutionRecord) -> bool: + return record["execution_id"] == record["cache_key"] + + def _invoke_cancel_update(self, execution_id: str, record: ExecutionRecord) -> dict[str, Any]: + argv_ref = Ref(f"node-argv:{cast(str, record['cache_key'])}") + with self._tx(readonly=True) as txn: + prepared = self._prepare_adapter_call( + execution_id, + argv_ref, + txn, + caller_execution_id=execution_id, + caller_cache_key=cast(str, record["cache_key"]), + ) + argv_ptr = self._remote_ops().put_ref_manifest(prepared.argv_ref) + launch_state = ExecutionState( + cast(str, record["cache_key"]), remote_root=self.remote_root + ).read_launch_state(execution_id) + return self._call_adapter( + prepared, + argv_ptr, + execution_id=execution_id, + state=(cast(dict[str, Any] | None, launch_state["resume_state"]) if launch_state is not None else None), + execution_status="cancel-pending", + cancel_requested_by=cast(str | None, record.get("cancellation_requested_by")), + ) + + def _call_adapter( + self, + prepared: _PreparedAdapterCall, + argv_ptr: str, + *, + execution_id: str, + state: dict[str, Any] | None, + execution_status: str | None, + cancel_requested_by: str | None, + ) -> dict[str, Any]: + envelope = { + "argv_ptr": argv_ptr, + "cache_key": prepared.cache_key, + "execution_id": execution_id, + "remote": { + "root": self.remote_root, + }, + "runnable": prepared.runnable, + "state": state, + 
"execution_status": execution_status, + "cancel_requested_by": cancel_requested_by, + } + cmd = [prepared.adapter_path] + if prepared.adapter_path.endswith(".py"): + cmd = [sys.executable, prepared.adapter_path] + result_data = run( + cmd, + input=json.dumps(envelope, default=lambda x: x.uri if isinstance(x, Uri) else x), + capture_output=True, + text=True, + ) + if result_data.returncode != 0: + raise DmlRepoError(f"Adapter call failed: {result_data.stderr}") + try: + stdout = json.loads(result_data.stdout) + except json.JSONDecodeError as e: + raise DmlRepoError("Adapter output must be JSON") from e + return self._validate_adapter_output(stdout) + + def _finish_fn_result_in_txn( + self, dag_ref: Ref, argv: list[Ref], name: Optional[str], txn, index_id: str + ) -> tuple[Ref, Error | None]: + del txn + out = self._finish_fn_result(dag_ref, argv, name, None, index_id) + with self._tx(readonly=True) as read_txn: + dag_obj: Dag = read_txn.get(dag_ref) + if dag_obj.result is None and dag_obj.error is None: + raise DmlRepoError("Function DAG has no result node.") + if dag_obj.error is not None: + err = read_txn.get(dag_obj.error) + if not isinstance(err, Error): + raise DmlRepoError(f"Expected Error object, got: {type(err).__name__}") + return out, err + return out, None + + def _finish_fn_result(self, dag_ref: Ref, argv: list[Ref], name: Optional[str], txn, index_id: str) -> Ref: + del txn + out = self._retry_index_publication( + index_id, + lambda old_commit, retry_txn: self._put_node_retry(FnNode(argv, dag_ref), name, old_commit, retry_txn), + ) + with self._tx(readonly=True) as read_txn: + dag_obj: Dag = read_txn.get(dag_ref) + if dag_obj.result is None and dag_obj.error is None: + raise DmlRepoError("Function DAG has no result node.") + if dag_obj.error is not None: + err = read_txn.get(dag_obj.error) + if not isinstance(err, Error): + raise DmlRepoError(f"Expected Error object, got: {type(err).__name__}") + raise err + return out + + def _put_literal(self, value: 
Any, txn, index_id: str, name: Optional[str] = None, idx_ctx=None) -> Ref: + if idx_ctx is None: + idx_ctx = txn.get_commit_ctx(HeadOps(_db=self._db).get_index_commit(index_id)) + if idx_ctx.dag is None: + raise DmlRepoError("Index commit has no DAG.") + + def _put(x) -> Ref: + if isinstance(x, Ref): + if not txn.exists(x): + raise DmlRepoError(f"Referenced object does not exist: {x}") + if x.nss()[0] == "node": + if x not in idx_ctx.dag.nodes: + raise DmlRepoError(f"Referenced node is not part of DAG: {x}") + return x + if x.nss()[0] == "datum": + return x + raise DmlRepoError(f"Invalid reference namespace for literal value: {x.ns()}") + if isinstance(x, Runnable): + target_ref = _put(x.target) + if target_ref.nss()[0] == "node": + target_ref = self._resolve_node_value_ref(target_ref, txn) + if target_ref.ns() != "datum-uri": + raise DmlRepoError("Runnable target must resolve to a Uri datum.") + sub_ref = None + if x.sub is not None: + sub_ref = _put(x.sub) + if sub_ref.nss()[0] == "node": + sub_ref = self._resolve_node_value_ref(sub_ref, txn) + if sub_ref.ns() != "datum-runnable": + raise DmlRepoError("Runnable sub must resolve to a Runnable datum.") + kwargs_data = {} + for key, value in x.kwargs.items(): + if not isinstance(key, str): + raise DmlRepoError("Runnable kwargs keys must be strings.") + value_ref = _put(value) + if value_ref.nss()[0] == "node": + value_ref = self._resolve_node_value_ref(value_ref, txn) + if value_ref.nss()[0] != "datum": + raise DmlRepoError("Runnable kwargs values must resolve to datum refs.") + kwargs_data[key] = value_ref + kwargs_ref = txn.put(DictDatum(data=kwargs_data)) + return txn.put(RunnableDatum(target=target_ref, sub=sub_ref, kwargs=kwargs_ref, adapter=x.adapter)) + if isinstance(x, Datum): + return txn.put(x) + if isinstance(x, set): + raise DmlRepoError("Set literals are not supported.") + if isinstance(x, tuple): + x = list(x) + if isinstance(x, list): + ys = [_put(v) for v in x] + if any(isinstance(v, Ref) and 
v.nss()[0] == "node" for v in ys): + ys = [ + self._put_literal(v, txn, index_id, idx_ctx=idx_ctx) if v.nss()[0] != "node" else v for v in ys + ] + fn_uri = txn.put(Uri("daggerml:list")) + fn_kwargs = txn.put(DictDatum(data={})) + fn = self._put_literal( + RunnableDatum(target=fn_uri, sub=None, kwargs=fn_kwargs, adapter=""), + txn, + index_id, + idx_ctx=idx_ctx, + ) + argv_refs = [fn, *ys] + argv_ref = self._prepare_fn(index_id, argv_refs, {}, txn, ctx=idx_ctx) + dag_ref = self._run_builtin(argv_ref, txn) + assert dag_ref is not None + dag_obj: Dag = txn.get(dag_ref) + if dag_obj.result is None and dag_obj.error is None: + raise DmlRepoError("Function DAG has no result node.") + resp = self._put_node_in_ctx(idx_ctx, txn, FnNode(argv_refs, dag_ref)) + if dag_obj.error is not None: + raise txn.get(dag_obj.error) + return resp + return txn.put(ListDatum(ys)) + if isinstance(x, dict): + ys = {k: _put(v) for k, v in x.items()} + if any(isinstance(v, Ref) and v.nss()[0] == "node" for v in ys.values()): + yks = [self._put_literal(k, txn, index_id, idx_ctx=idx_ctx) for k in ys.keys()] + yvs = [ + self._put_literal(v, txn, index_id, idx_ctx=idx_ctx) if v.nss()[0] != "node" else v + for v in ys.values() + ] + fn_uri = txn.put(Uri("daggerml:dict")) + fn_kwargs = txn.put(DictDatum(data={})) + fn = self._put_literal( + RunnableDatum(target=fn_uri, sub=None, kwargs=fn_kwargs, adapter=""), + txn, + index_id, + idx_ctx=idx_ctx, + ) + argv_refs = [fn, *unnest(zip(yks, yvs, strict=True))] + argv_ref = self._prepare_fn(index_id, argv_refs, {}, txn, ctx=idx_ctx) + dag_ref = self._run_builtin(argv_ref, txn) + assert dag_ref is not None + dag_obj: Dag = txn.get(dag_ref) + if dag_obj.result is None and dag_obj.error is None: + raise DmlRepoError("Function DAG has no result node.") + resp = self._put_node_in_ctx(idx_ctx, txn, FnNode(argv_refs, dag_ref)) + if dag_obj.error is not None: + raise txn.get(dag_obj.error) + return resp + return txn.put(DictDatum(ys)) + return 
txn.put(ScalarDatum(x)) + + result_ref = _put(value) + if result_ref.nss()[0] == "node": + if name is not None: + idx_ctx.dag.nodes = sorted({result_ref, *idx_ctx.dag.nodes}) + idx_ctx.dag.names[name] = result_ref + idx_ctx.commit.dag = txn.put(idx_ctx.dag) + idx_ctx.commit.modified = now() + return result_ref + # Create literal node directly in transaction + node_ref = txn.put(LiteralNode(value=result_ref)) + idx_ctx.dag.nodes = sorted({node_ref, *idx_ctx.dag.nodes}) + if name is not None: + idx_ctx.dag.names[name] = node_ref + idx_ctx.commit.dag = txn.put(idx_ctx.dag) + return node_ref + + def _put_literal_retry( + self, + value: Any, + txn, + index_id: str, + old_commit: Ref, + name: Optional[str] = None, + ) -> tuple[Ref, Ref]: + idx_ctx = txn.get_commit_ctx(old_commit) + node_ref = self._put_literal(value, txn, index_id, name=name, idx_ctx=idx_ctx) + idx_ctx.commit.modified = now() + return node_ref, txn.put(idx_ctx.commit) + + def _put_node_retry(self, node: Node, name: Optional[str], old_commit: Ref, txn) -> tuple[Ref, Ref]: + ctx = txn.get_commit_ctx(old_commit) + if ctx.dag is None: + raise DmlRepoError("Index commit has no DAG.") + node_ref = self._put_node_in_ctx(ctx, txn, node, name=name) + ctx.commit.modified = now() + return node_ref, txn.put(ctx.commit) + + def _put_node_in_ctx(self, ctx, txn, node: Node, name: Optional[str] = None) -> Ref: + node_ref = txn.put(node) + ctx.dag.nodes = sorted({node_ref, *ctx.dag.nodes}) + if name is not None: + ctx.dag.names[name] = node_ref + ctx.commit.dag = txn.put(ctx.dag) + return node_ref + + def _retry_index_publication(self, index_id: str, build): + head_ops = HeadOps(_db=self._db) + old_commit = head_ops.get_index_commit(index_id) + while True: + with self._tx(readonly=False) as txn: + result, new_commit = build(old_commit, txn) + try: + head_ops.update_index_commit(index_id, old_commit, new_commit) + return result + except DmlPointerConflictError as err: + old_commit = err.current_commit + + def 
_store_scratch_value(self, value: Any, txn) -> Ref: + if isinstance(value, Datum): + return txn.put(value) + if isinstance(value, Runnable): + target_ref = self._store_scratch_value(value.target, txn) + if target_ref.ns() != "datum-uri": + raise DmlRepoError("Runnable target must resolve to a Uri datum.") + sub_ref = None + if value.sub is not None: + sub_ref = self._store_scratch_value(value.sub, txn) + if sub_ref.ns() != "datum-runnable": + raise DmlRepoError("Runnable sub must resolve to a Runnable datum.") + kwargs_ref = self._store_scratch_value(value.kwargs, txn) + if kwargs_ref.ns() != "datum-dict": + raise DmlRepoError("Runnable kwargs must resolve to a Dict datum.") + return txn.put(RunnableDatum(target=target_ref, sub=sub_ref, kwargs=kwargs_ref, adapter=value.adapter)) + if isinstance(value, set): + raise DmlRepoError("Set literals are not supported.") + if isinstance(value, tuple): + value = list(value) + if isinstance(value, list): + return txn.put(ListDatum([self._store_scratch_value(v, txn) for v in value])) + if isinstance(value, dict): + return txn.put(DictDatum(data={k: self._store_scratch_value(v, txn) for k, v in value.items()})) + return txn.put(ScalarDatum(value)) + + def _build_scratch_dag(self, argv_ref: Ref, *, result: Any = None, error: Error | None = None) -> Ref: + with self._tx(readonly=False) as txn: + return self._build_scratch_dag_in_txn(argv_ref, txn, result=result, error=error) + + def _build_scratch_dag_in_txn(self, argv_ref: Ref, txn, *, result: Any = None, error: Error | None = None) -> Ref: + nodes = [argv_ref, self._kwargv_from_argv(argv_ref, txn)] + dag = Dag(nodes=nodes, names={}, result=None, argv=argv_ref) + if error is not None: + dag.error = txn.put(error) + else: + result_ref = txn.put(LiteralNode(value=self._store_scratch_value(result, txn))) + dag.nodes = sorted({result_ref, *dag.nodes}) + dag.result = result_ref + dag_ref = txn.put(dag) + tree_ref = txn.put(Tree(dags={})) + txn.put(Commit(message="", dag=dag_ref, 
@dataclass
class NodeOps(BaseOps):
    """Read-side helpers that resolve node refs into values or metadata."""

    def _require_node_ref(self, node_ref: Ref) -> Ref:
        """Return ``node_ref`` unchanged after checking it is a ``Ref`` in the node namespace."""
        if isinstance(node_ref, Ref):
            if node_ref.nss()[0] == "node":
                return node_ref
            raise DmlRepoError(f"Expected node ref, got: {node_ref}")
        raise DmlRepoError(f"Expected Ref, got: {type(node_ref).__name__}")

    def _unroll_datum_ref(self, ref: Ref, txn, *, _stack: set[Ref] | None = None) -> Any:
        """Recursively turn a datum ref into a plain Python value.

        Scalars yield their data, lists and dicts are unrolled element by
        element, ``Uri`` datums are returned as is, and runnable datums become
        ``Runnable`` objects. ``_stack`` holds the refs currently being
        unrolled and is used to detect cycles.

        Raises:
            DmlRepoError: For error refs, non-datum refs, cycles, unsupported
                datum types, or a runnable whose target/sub unroll to the
                wrong type.
        """
        if ref.ns() == "error":
            raise DmlRepoError("Cannot unroll error value.")
        if ref.nss()[0] != "datum":
            raise DmlRepoError(f"Expected datum ref, got: {ref}")

        in_progress = set() if _stack is None else _stack
        if ref in in_progress:
            raise DmlRepoError(f"Cycle detected while unrolling datum: {ref}")

        def descend(child: Ref) -> Any:
            return self._unroll_datum_ref(child, txn, _stack=in_progress)

        in_progress.add(ref)
        try:
            loaded: Datum = txn.get(ref)
            if isinstance(loaded, ScalarDatum):
                return loaded.data
            if isinstance(loaded, ListDatum):
                return [descend(item) for item in loaded.data]
            if isinstance(loaded, DictDatum):
                return {key: descend(item) for key, item in loaded.data.items()}
            if isinstance(loaded, Uri):
                return loaded
            if not isinstance(loaded, RunnableDatum):
                raise DmlRepoError(f"Unsupported datum type: {type(loaded).__name__}")

            target = descend(loaded.target)
            kwargs_datum: DictDatum = txn.get(loaded.kwargs)
            kwargs = {key: descend(item) for key, item in kwargs_datum.data.items()}
            sub = None
            if loaded.sub is not None:
                sub = descend(loaded.sub)
                if not isinstance(sub, Runnable):
                    raise DmlRepoError(f"Runnable sub must unroll to Runnable, got {type(sub).__name__}")
            if not isinstance(target, Uri):
                raise DmlRepoError(f"Runnable target must unroll to Uri, got {type(target).__name__}")
            return Runnable(target=target, sub=sub, kwargs=kwargs, adapter=loaded.adapter)
        finally:
            in_progress.remove(ref)

    def get(self, node_ref: Ref) -> Any:
        """Return the node's value one layer deep; refs inside lists and dicts are kept.

        Runnable values are the exception: they are fully unrolled. Any
        failure is re-raised as ``DmlRepoError``.
        """
        try:
            checked = self._require_node_ref(node_ref)
            with self._tx(readonly=True) as txn:
                node: Node = txn.get(checked)
                value_ref = node.datum_ref(txn)
                datum: Datum = txn.get(value_ref)
                if isinstance(datum, ScalarDatum):
                    shallow = datum.data
                elif isinstance(datum, ListDatum):
                    shallow = list(datum.data)
                elif isinstance(datum, DictDatum):
                    shallow = dict(datum.data)
                elif isinstance(datum, Uri):
                    shallow = datum
                elif isinstance(datum, RunnableDatum):
                    shallow = self._unroll_datum_ref(value_ref, txn)
                else:
                    raise DmlRepoError(f"Unsupported datum type: {type(datum).__name__}")
                return shallow
        except Exception as exc:
            raise DmlRepoError(f"Failed to get node value: {exc}") from exc

    def unroll(self, node_ref: Ref) -> Any:
        """Return the node's value with every nested datum ref resolved.

        Any failure is re-raised as ``DmlRepoError``.
        """
        try:
            checked = self._require_node_ref(node_ref)
            with self._tx(readonly=True) as txn:
                node: Node = txn.get(checked)
                return self._unroll_datum_ref(node.datum_ref(txn), txn)
        except Exception as exc:
            raise DmlRepoError(f"Failed to unroll node value: {exc}") from exc

    def describe(self, node_ref: Ref) -> dict[str, Any]:
        """Return metadata for a node: ``id``, ``ref``, ``type`` and ``value_ref``.

        Function nodes add ``dag`` and ``argv``; import nodes add ``dag`` and
        ``node``. Any failure is re-raised as ``DmlRepoError``.
        """
        try:
            checked = self._require_node_ref(node_ref)
            with self._tx(readonly=True) as txn:
                node: Node = txn.get(checked)
                details: dict[str, Any] = {
                    "id": checked.id(),
                    "ref": checked,
                    "type": type(node).__name__,
                    "value_ref": node.datum_ref(txn),
                }
                if isinstance(node, FnNode):
                    details.update(dag=node.dag, argv=list(node.argv))
                if isinstance(node, ImportNode):
                    details.update(dag=node.dag, node=node.node)
                return details
        except Exception as exc:
            raise DmlRepoError(f"Failed to describe node: {exc}") from exc
+""" + +import base64 +import hashlib +import json +import re +import sqlite3 +import time +from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait +from dataclasses import dataclass, field, fields, is_dataclass +from functools import wraps +from typing import Any, Literal, cast + +import boto3 +from botocore.config import Config + +from daggerml._internal._db import Ref +from daggerml._internal.ops.base_ops import BaseOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.revision_uri import ( + RevisionUri, + canonicalize_revision_uri, + parse_revision_uri, + validate_ref_name, + validate_segment, +) +from daggerml._internal.types import Commit, DmlRepoError, Tree + + +def _get_s3_client(): + return boto3.client("s3", config=Config(max_pool_connections=20)) + + +class RemoteError(Exception): + """Base exception for remote operations.""" + + pass + + +class RefAlreadyExists(Exception): + """Raised when attempting to create a ref that already exists.""" + + pass + + +class RefUpdateConflict(Exception): + """Raised when a conditional mutable ref update loses a race.""" + + pass + + +class InvalidOid(Exception): + """Raised when an object ID is invalid.""" + + pass + + +class InvalidManifest(Exception): + """Raised when a manifest is invalid.""" + + pass + + +class InvalidRef(Exception): + """Raised when a ref is invalid.""" + + pass + + +class MissingCasObject(Exception): + """Raised when a CAS object is missing.""" + + pass + + +class ShaMismatch(Exception): + """Raised when SHA256 verification fails.""" + + pass + + +@dataclass(frozen=True) +class _ManifestFetchResult: + manifest_oid: str + manifest: dict + + +@dataclass(frozen=True) +class _DagRefFetchResult: + dag_id: str + manifest_oid: str + + +@dataclass(frozen=True) +class _CasFetchResult: + ns: str + oid: str + raw_bytes: bytes + + +@dataclass(frozen=True) +class RemoteRefRead: + """Decoded remote ref with its object ETag for conditional updates.""" + + ref: dict + 
etag: str | None + + +DmlProjectUri = RevisionUri + + +_REMOTE_FETCH_WORKERS_DEFAULT = 16 + + +def _resolve_fetch_workers(configured: int) -> int: + if not isinstance(configured, int) or configured <= 0: + raise ValueError("Remote fetch workers must be a positive integer") + return configured + + +def _remote_boundary(action: str): + """Convert public remote-operation failures into DmlRepoError.""" + + def _decorate(fn): + @wraps(fn) + def _wrapped(self, *args, **kwargs): + try: + return fn(self, *args, **kwargs) + except DmlRepoError: + raise + except Exception as exc: + raise DmlRepoError(f"Remote {action} failed: {exc}") from exc + + return _wrapped + + return _decorate + + +@dataclass +class RemoteOps(BaseOps): + """Remote operations for CAS + refs backed by S3. + + This class provides methods to push and pull repository state + between local storage and remote S3-backed storage. + """ + + bucket: str + prefix: str + fetch_workers: int = _REMOTE_FETCH_WORKERS_DEFAULT + client: Any = field(default_factory=_get_s3_client) + _IO_INVOKE_PRUNE_AGE_SECONDS: int = 24 * 3600 + + @_remote_boundary("initialization") + def __post_init__(self): + if not isinstance(self.bucket, str) or not self.bucket: + raise ValueError("Remote bucket is required") + if not isinstance(self.prefix, str): + raise ValueError("Remote prefix must be a string") + self.fetch_workers = _resolve_fetch_workers(self.fetch_workers) + self._ensure_remote_descriptor() + super().__post_init__() + + def __put(self, key, value, **kwargs): + self.client.put_object(Bucket=self.bucket, Key=key, Body=value, **kwargs) + self.client.get_waiter("object_exists").wait(Bucket=self.bucket, Key=key) + + def _prefixed_key(self, relative_key: str) -> str: + """Join configured prefix and relative key without leading slash when prefix is empty.""" + return f"{self.prefix}/{relative_key}" if self.prefix else relative_key + + def _ensure_remote_descriptor(self) -> None: + """Ensure the remote prefix has a valid dml.json 
descriptor file. + + Creates the descriptor if missing. + If present but invalid, this is a hard failure. + """ + descriptor_key = f"{self.prefix}/dml.json" if self.prefix else "dml.json" + expected_descriptor = { + "schema": 0, + "hash": "sha256", + "layout": "cas+refs", + "refs_prefix": "refs", + "io_prefix": "io", + "cas_prefix": "cas/sha256", + } + try: + # Try to get existing descriptor + response = self.client.get_object(Bucket=self.bucket, Key=descriptor_key) + descriptor = json.loads(response["Body"].read().decode("utf-8")) + # Check if it matches expected descriptor + if descriptor != expected_descriptor: + raise InvalidRef("Invalid remote descriptor") + except self.client.exceptions.NoSuchKey: + # Descriptor doesn't exist, create it + descriptor_json = json.dumps(expected_descriptor, separators=(",", ":"), sort_keys=True) + self.__put(descriptor_key, descriptor_json.encode("utf-8"), ContentType="application/json") + + @staticmethod + def _validate_cache_key(cache_key: str) -> str: + if not isinstance(cache_key, str) or not cache_key: + raise ValueError("Invalid cache key: must be a non-empty string") + if cache_key in {".", ".."} or "/" in cache_key or "\\" in cache_key: + raise ValueError(f"Invalid cache key: {cache_key!r}") + return cache_key + + @staticmethod + def _validate_project_segment(label: str, value: str) -> str: + return validate_segment(f"project {label}", value) + + @staticmethod + def _validate_ref_name(label: str, value: str) -> str: + return validate_ref_name(label, value) + + @classmethod + def parse_dml_uri(cls, uri: str, *, require_identifier: bool = False) -> DmlProjectUri: + return parse_revision_uri(uri, require_identifier=require_identifier) + + @classmethod + def canonical_dml_uri(cls, uri: str, *, require_identifier: bool = False) -> str: + return canonicalize_revision_uri(uri, require_identifier=require_identifier) + + def _project_branch_ref_path(self, owner: str, project: str, branch: str) -> str: + owner = 
self._validate_project_segment("owner", owner) + project = self._validate_project_segment("name", project) + branch = self._validate_ref_name("branch", branch) + return f"projects/{owner}/{project}/heads/{branch}.json" + + def _project_tag_ref_path(self, owner: str, project: str, tag: str) -> str: + owner = self._validate_project_segment("owner", owner) + project = self._validate_project_segment("name", project) + tag = self._validate_ref_name("tag", tag) + return f"projects/{owner}/{project}/tags/{tag}.json" + + def _dml_uri_ref_path(self, uri: str) -> str: + parsed = self.parse_dml_uri(uri, require_identifier=True) + if parsed.branch is not None: + return self._project_branch_ref_path(parsed.owner, parsed.project, parsed.branch) + if parsed.tag is not None: + return self._project_tag_ref_path(parsed.owner, parsed.project, parsed.tag) + raise ValueError(f"DML URI must include a branch or tag: {uri!r}") + + @staticmethod + def _validate_manifest_oid(manifest_oid: str) -> str: + if not isinstance(manifest_oid, str) or not re.match(r"^[0-9a-f]{64}$", manifest_oid): + raise InvalidOid(f"Invalid OID: must be 64 lowercase hex characters, got {manifest_oid!r}") + return manifest_oid + + @staticmethod + def _validate_dag_id(dag_id: str) -> str: + if not isinstance(dag_id, str) or not re.match(r"^[0-9a-f]{64}$", dag_id): + raise ValueError(f"Invalid DAG id: must be 64 lowercase hex characters, got {dag_id!r}") + return dag_id + + def _dag_ref_path(self, dag_id: str) -> str: + dag_id = self._validate_dag_id(dag_id) + return f"dags/{dag_id}.json" + + def _dag_ref_key(self, dag_id: str) -> str: + return self._prefixed_key(f"refs/{self._dag_ref_path(dag_id)}") + + def _cache_ref_path(self, cache_key: str) -> str: + cache_key = self._validate_cache_key(cache_key) + return f"cache/{cache_key}.json" + + def _cas_key(self, oid: str) -> str: + """Generate CAS key for object ID with sharding. 
+ + Parameters + ---------- + oid : str + Object ID (64-character lowercase hex string) + + Returns + ------- + str + S3 key for the CAS object + + Raises + ------ + InvalidOid + If oid is not a valid 64-character lowercase hex string + """ + if not re.match(r"^[0-9a-f]{64}$", oid): + raise InvalidOid(f"Invalid OID: must be 64 lowercase hex characters, got {oid!r}") + aa = oid[:2] + bb = oid[2:4] + return self._prefixed_key(f"cas/sha256/{aa}/{bb}/{oid}") + + def _ref_key(self, ref_path: str) -> str: + """Generate ref key for reference path. + + Parameters + ---------- + ref_path : str + Reference path + + Returns + ------- + str + S3 key for the ref + + Raises + ------ + ValueError + If ref_path contains path traversal sequences + """ + if ref_path.startswith("/"): + raise ValueError(f"Invalid ref path: cannot start with '/', got {ref_path!r}") + segments = ref_path.split("/") + if not segments or any(seg == "" for seg in segments): + raise ValueError(f"Invalid ref path: empty path segment in {ref_path!r}") + if any(seg in {".", ".."} for seg in segments): + raise ValueError(f"Invalid ref path: forbidden path segment in {ref_path!r}") + if any("\\" in seg for seg in segments): + raise ValueError(f"Invalid ref path: path segments must not contain '\\\\': {ref_path!r}") + + # Only tags/cache/project refs are valid protocol refs. 
+ root = segments[0] + if root not in {"tags", "cache", "projects"}: + raise ValueError(f"Invalid ref path root: expected 'tags' or 'cache' or 'projects', got {root!r}") + + if root == "tags": + if len(segments) != 3 or not segments[2].endswith(".json"): + raise ValueError("Invalid tags ref path: expected tags//.json") + name = segments[1] + version = segments[2][: -len(".json")] + seg_re = r"^[a-z0-9][a-z0-9._-]{0,127}$" + if not re.match(seg_re, name): + raise ValueError(f"Invalid tag name: {name!r}") + if not re.match(seg_re, version): + raise ValueError(f"Invalid tag version: {version!r}") + elif root == "cache": + if len(segments) != 2 or not segments[1].endswith(".json"): + raise ValueError("Invalid cache ref path: expected cache/.json") + cache_key = segments[1][: -len(".json")] + self._validate_cache_key(cache_key) + else: + if len(segments) < 5 or segments[1] == "" or segments[2] == "" or segments[3] not in {"heads", "tags"}: + raise ValueError( + "Invalid project ref path: expected projects///{heads,tags}/.json" + ) + if not segments[-1].endswith(".json"): + raise ValueError("Invalid project ref path: expected .json filename") + owner = segments[1] + project = segments[2] + name = "/".join([*segments[4:-1], segments[-1][: -len(".json")]]) + self._validate_project_segment("owner", owner) + self._validate_project_segment("name", project) + self._validate_ref_name("branch" if segments[3] == "heads" else "tag", name) + return self._prefixed_key(f"refs/{ref_path}") + + def _remote_has_cas(self, oid: str) -> bool: + """Check if CAS object exists in remote storage. 
+ + Parameters + ---------- + oid : str + Object ID (64-character lowercase hex string) + + Returns + ------- + bool + True if the object exists, False otherwise + """ + try: + self.client.head_object(Bucket=self.bucket, Key=self._cas_key(oid)) + return True + except self.client.exceptions.ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code in ("NoSuchKey", "404"): + return False + raise + + def _remote_get_cas(self, oid: str) -> bytes: + """Get CAS object data from remote storage. + + Parameters + ---------- + oid : str + Object ID (64-character lowercase hex string) + + Returns + ------- + bytes + The object data + + Raises + ------ + MissingCasObject + If the object does not exist + """ + try: + response = self.client.get_object(Bucket=self.bucket, Key=self._cas_key(oid)) + return response["Body"].read() + except self.client.exceptions.ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code in ("NoSuchKey", "404"): + raise MissingCasObject(f"CAS object {oid} not found") from None + raise + + def _remote_put_cas(self, oid: str, data: bytes) -> None: + """Put CAS object data to remote storage. + + Parameters + ---------- + oid : str + Object ID (64-character lowercase hex string) + data : bytes + The object data to store + """ + self.__put(self._cas_key(oid), data) + + def _remote_get_ref(self, ref_path: str) -> bytes: + """Get ref data from remote storage. 
+ + Parameters + ---------- + ref_path : str + Reference path + + Returns + ------- + bytes + The ref data + + Raises + ------ + RemoteError + If the ref does not exist + """ + try: + response = self.client.get_object(Bucket=self.bucket, Key=self._ref_key(ref_path)) + return response["Body"].read() + except self.client.exceptions.ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code in ("NoSuchKey", "404"): + raise RemoteError(f"Ref {ref_path} not found") from None + raise + + def _remote_get_ref_with_etag(self, ref_path: str) -> RemoteRefRead: + try: + response = self.client.get_object(Bucket=self.bucket, Key=self._ref_key(ref_path)) + return RemoteRefRead(self._decode_ref(response["Body"].read()), response.get("ETag")) + except self.client.exceptions.ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code in ("NoSuchKey", "404"): + raise RemoteError(f"Ref {ref_path} not found") from None + raise + + def _remote_get_dag_ref(self, dag_id: str) -> bytes: + dag_id = self._validate_dag_id(dag_id) + ref_path = self._dag_ref_path(dag_id) + try: + response = self.client.get_object(Bucket=self.bucket, Key=self._dag_ref_key(dag_id)) + return response["Body"].read() + except self.client.exceptions.ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code in ("NoSuchKey", "404"): + raise RemoteError(f"Ref {ref_path} not found") from None + raise + + def _remote_put_ref(self, ref_path: str, data: bytes) -> None: + """Put ref data to remote storage. 
+ + Parameters + ---------- + ref_path : str + Reference path + data : bytes + The ref data to store + + Raises + ------ + RefAlreadyExists + If the ref already exists + """ + # Check if ref already exists + try: + self.client.head_object(Bucket=self.bucket, Key=self._ref_key(ref_path)) + raise RefAlreadyExists(f"Ref {ref_path} already exists") + except self.client.exceptions.ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code not in ("NoSuchKey", "404"): + raise + + self.__put(self._ref_key(ref_path), data, ContentType="application/json") + + def _remote_put_ref_if_match(self, ref_path: str, data: bytes, etag: str) -> None: + try: + self.__put(self._ref_key(ref_path), data, ContentType="application/json", IfMatch=etag) + except self.client.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("PreconditionFailed", "412"): + raise RefUpdateConflict(f"Ref {ref_path} changed during update") from None + raise + + def _remote_put_ref_if_absent(self, ref_path: str, data: bytes) -> None: + try: + self.__put(self._ref_key(ref_path), data, ContentType="application/json", IfNoneMatch="*") + except self.client.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("PreconditionFailed", "412"): + raise RefAlreadyExists(f"Ref {ref_path} already exists") from None + raise + + def _put_json_key_if_absent(self, key: str, value: dict[str, Any]) -> bool: + data = json.dumps(value, separators=(",", ":"), sort_keys=True).encode("utf-8") + try: + self.__put(key, data, ContentType="application/json", IfNoneMatch="*") + return True + except self.client.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("PreconditionFailed", "412"): + return False + raise + + def _get_json_key_with_etag(self, key: str) -> tuple[dict[str, Any] | None, str | None]: + try: + response = self.client.get_object(Bucket=self.bucket, Key=key) + except self.client.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("NoSuchKey", "404"): + 
return None, None + raise + return json.loads(response["Body"].read()), response.get("ETag") + + def _put_json_key_if_match(self, key: str, value: dict[str, Any], etag: str) -> bool: + data = json.dumps(value, separators=(",", ":"), sort_keys=True).encode("utf-8") + try: + self.__put(key, data, ContentType="application/json", IfMatch=etag) + return True + except self.client.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("PreconditionFailed", "412"): + return False + raise + + def _delete_key_if_match(self, key: str, etag: str) -> bool: + try: + self.client.delete_object(Bucket=self.bucket, Key=key, IfMatch=etag) + return True + except self.client.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("PreconditionFailed", "412", "NoSuchKey", "404"): + return False + raise + + def _remote_put_dag_ref(self, dag_id: str, data: bytes) -> None: + dag_id = self._validate_dag_id(dag_id) + ref_path = self._dag_ref_path(dag_id) + try: + self.client.head_object(Bucket=self.bucket, Key=self._dag_ref_key(dag_id)) + raise RefAlreadyExists(f"Ref {ref_path} already exists") + except self.client.exceptions.ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code not in ("NoSuchKey", "404"): + raise + + self.__put(self._dag_ref_key(dag_id), data, ContentType="application/json") + + def _remote_delete_ref(self, ref_path: str) -> None: + """Delete ref from remote storage. 
+ + Parameters + ---------- + ref_path : str + Reference path + """ + self.client.delete_object(Bucket=self.bucket, Key=self._ref_key(ref_path)) + + def _local_put_head( + self, + txn_or_remote_name, + remote_name_or_ref_path: str, + ref_path_or_commit_id: str, + commit_id: str | None = None, + ) -> None: + if commit_id is None: + remote_name = txn_or_remote_name + ref_path = remote_name_or_ref_path + commit_id = ref_path_or_commit_id + else: + remote_name = remote_name_or_ref_path + ref_path = ref_path_or_commit_id + branch = f"{remote_name}/{ref_path}" + self._local_put_tracking_branch(branch, Ref(f"commit:{commit_id}")) + + def _local_put_tracking_head(self, uri: str, commit_id: str) -> None: + canonical = self.canonical_dml_uri(uri, require_identifier=True) + self._local_put_tracking_branch(canonical, Ref(f"commit:{commit_id}")) + + def _local_put_tracking_branch(self, branch: str, commit_ref: Ref) -> None: + head_ops = HeadOps(_db=self._db) + try: + current = head_ops.get_branch_commit(branch) + head_ops.update_branch_commit(branch, current, commit_ref) + return + except DmlRepoError: + pass + head_ops.create_branch(branch, commit_ref) + + def _resolve_branch_push_target(self, branch: str) -> tuple[Ref, str]: + commit_ref = HeadOps(_db=self._db).get_branch_commit(branch) + return commit_ref, f"tags/{branch}/{commit_ref.id()}.json" + + def _ref_payload(self, manifest_id: str, targets: dict[str, list[str]], meta: dict | None = None) -> bytes: + ref_obj = { + "kind": "ref", + "schema": 0, + "target": self._validate_manifest_oid(manifest_id), + "created_at": int(time.time()), + "targets": self._validate_targets(targets), + "meta": meta or {}, + } + return json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + def _validate_project_ref_target(self, manifest_id: str, targets: dict[str, list[str]]) -> None: + manifest = self._decode_manifest(self._remote_get_cas(manifest_id)) + if manifest.get("root-ns") != "commit": + raise 
InvalidManifest("Project refs must point to commit manifests") + expected_targets = {"dag": sorted(set(manifest.get("closure", {}).get("dag", [])))} + if self._validate_targets(targets) != expected_targets: + raise InvalidRef(f"Project ref targets mismatch: expected {expected_targets}, got {targets}") + + def get_project_branch_ref(self, owner: str, project: str, branch: str) -> RemoteRefRead: + return self._remote_get_ref_with_etag(self._project_branch_ref_path(owner, project, branch)) + + def put_project_branch_ref( + self, + owner: str, + project: str, + branch: str, + manifest_id: str, + *, + targets: dict[str, list[str]], + etag: str | None, + create: bool = False, + ) -> str: + ref_path = self._project_branch_ref_path(owner, project, branch) + self._validate_project_ref_target(manifest_id, targets) + ref_bytes = self._ref_payload(manifest_id, targets) + if create: + self._remote_put_ref_if_absent(ref_path, ref_bytes) + elif etag is not None: + self._remote_put_ref_if_match(ref_path, ref_bytes, etag) + else: + self._remote_put_ref(ref_path, ref_bytes) + return ref_path + + def put_project_tag_ref( + self, + owner: str, + project: str, + tag: str, + manifest_id: str, + *, + targets: dict[str, list[str]], + ) -> str: + ref_path = self._project_tag_ref_path(owner, project, tag) + self._validate_project_ref_target(manifest_id, targets) + self._remote_put_ref_if_absent(ref_path, self._ref_payload(manifest_id, targets)) + return ref_path + + def _decode_ref(self, data: bytes) -> dict: + """Decode and validate ref data from bytes. 
+ + Parameters + ---------- + data : bytes + JSON-encoded ref data + + Returns + ------- + dict + Decoded and validated ref object + + Raises + ------ + InvalidRef + If the data is not a valid ref + """ + o = json.loads(data) + if o.get("kind") != "ref": + raise InvalidRef("Invalid ref: kind must be 'ref'") + if o.get("schema") != 0: + raise InvalidRef("Invalid ref: schema must be 0") + target = o.get("target") + if not isinstance(target, str) or not re.match(r"^[0-9a-f]{64}$", target): + raise InvalidRef("Invalid ref: target must be 64 lowercase hex characters") + created_at = o.get("created_at") + if not isinstance(created_at, int): + raise InvalidRef("Invalid ref: created_at must be an integer") + targets = o.get("targets") + if targets is not None: + if not isinstance(targets, dict): + raise InvalidRef("Invalid ref: targets must be an object") + if set(targets) != {"dag"}: + raise InvalidRef("Invalid ref: targets supports only the 'dag' namespace") + dag_targets = targets["dag"] + if not isinstance(dag_targets, list): + raise InvalidRef("Invalid ref: targets.dag must be a sorted unique list of 64 lowercase hex ids") + if dag_targets != sorted(dag_targets) or len(dag_targets) != len(set(dag_targets)): + raise InvalidRef("Invalid ref: targets.dag must be a sorted unique list of 64 lowercase hex ids") + for dag_id in dag_targets: + if not isinstance(dag_id, str) or not re.match(r"^[0-9a-f]{64}$", dag_id): + raise InvalidRef("Invalid ref: targets.dag must be a sorted unique list of 64 lowercase hex ids") + return o + + def _decode_manifest(self, data: bytes) -> dict: + """Decode and validate manifest data from bytes. 
+ + Parameters + ---------- + data : bytes + JSON-encoded manifest data + + Returns + ------- + dict + Decoded and validated manifest object + + Raises + ------ + InvalidManifest + If the data is not a valid manifest + """ + o = json.loads(data) + if o.get("kind") != "manifest": + raise InvalidManifest("Invalid manifest: kind must be 'manifest'") + if o.get("schema") != 0: + raise InvalidManifest("Invalid manifest: schema must be 0") + if "root-ns" not in o or "root-id" not in o: + raise InvalidManifest("Invalid manifest: must have 'root-ns' and 'root-id'") + closure = o.get("closure") + if not isinstance(closure, dict): + raise InvalidManifest("Invalid manifest: 'closure' must be a dict") + for kind, ids in closure.items(): + if not isinstance(ids, list): + raise InvalidManifest(f"Invalid manifest: closure['{kind}'] must be a list") + if ids != sorted(ids): + raise InvalidManifest(f"Invalid manifest: closure['{kind}'] must be sorted") + if len(ids) != len(set(ids)): + raise InvalidManifest(f"Invalid manifest: closure['{kind}'] must have no duplicates") + for oid in ids: + if not isinstance(oid, str) or not re.match(r"^[0-9a-f]{64}$", oid): + raise InvalidManifest(f"Invalid manifest: oid '{oid}' must be 64 lowercase hex characters") + return o + + def _closure_union(self, closure: dict[str, list[str]]) -> set[str]: + """Compute the union of all OIDs across all closure kinds. 
+ + Parameters + ---------- + closure : dict[str, list[str]] + Closure mapping from kind to list of OIDs + + Returns + ------- + set[str] + Set of all unique OIDs across all kinds + """ + union_oids = set() + for oids in closure.values(): + union_oids.update(oids) + return union_oids + + def _local_dump_dict(self, txn, root_ref) -> dict: + closure: dict[str, dict[str, str]] = {} + visited: set[Ref] = set() + to_visit = [root_ref] + + while to_visit: + ref = to_visit.pop() + if ref in visited: + continue + visited.add(ref) + closure.setdefault(ref.ns(), {})[ref.id()] = txn.txn.get(ref, raw=True) + obj = txn.get(ref) + self._collect_local_manifest_refs(obj, root_ref=root_ref, to_visit=to_visit, visited=visited) + + return { + "kind": "local-manifest", + "schema": 0, + "root-ns": root_ref.ns(), + "root-id": root_ref.id(), + "closure": closure, + } + + def _collect_local_manifest_refs(self, obj: Any, *, root_ref: Ref, to_visit: list[Ref], visited: set[Ref]) -> None: + if isinstance(obj, Ref): + if obj.ns() == "dag" and obj != root_ref: + return + if obj not in visited: + to_visit.append(obj) + return + if isinstance(obj, dict): + for value in obj.values(): + self._collect_local_manifest_refs(value, root_ref=root_ref, to_visit=to_visit, visited=visited) + return + if isinstance(obj, list): + for value in obj: + self._collect_local_manifest_refs(value, root_ref=root_ref, to_visit=to_visit, visited=visited) + return + if is_dataclass(obj): + for field_def in fields(obj): + self._collect_local_manifest_refs( + getattr(obj, field_def.name), root_ref=root_ref, to_visit=to_visit, visited=visited + ) + + def _local_has(self, txn, ns: str, id: str) -> bool: + """Check if a local object exists in the given namespace. 
+ + Parameters + ---------- + txn : TxnContext + Transaction context + ns : str + Namespace + id : str + Object ID + + Returns + ------- + bool + True if object exists, False otherwise + """ + try: + txn.get(Ref(f"{ns}:{id}")) + return True + except DmlRepoError: + return False + + def _build_remote_manifest( + self, local_manifest: dict, *, require_commit_root: bool = True, direct_dag_ids: list[str] | None = None + ) -> tuple[dict, bytes]: + """Build remote manifest dict and canonical bytes from local manifest. + + Parameters + ---------- + local_manifest : dict + Local manifest dictionary with closure as {ns: {id: dump_str}} + + Returns + ------- + tuple[dict, bytes] + Remote manifest dict and canonical JSON bytes + + Raises + ------ + ValueError + If root-ns is not "commit" + """ + # Validate root namespace when requested (push requirement) + root_ns = local_manifest["root-ns"] + if require_commit_root and root_ns != "commit": + raise ValueError(f"Cannot push non-commit root namespace: {root_ns!r}") + + root_id = local_manifest["root-id"] + + # Convert closure: {ns: {id: dump_str}} -> {ns: sorted([id...])} + remote_closure = {} + for ns, items in local_manifest["closure"].items(): + # Extract IDs, dedupe, and sort + ids = list(set(items.keys())) + ids.sort() + remote_closure[ns] = ids + if direct_dag_ids is not None: + if direct_dag_ids: + remote_closure["dag"] = sorted(set(direct_dag_ids)) + else: + remote_closure.pop("dag", None) + + # Build remote manifest dict + manifest_dict = { + "kind": "manifest", + "schema": 0, + "root-ns": root_ns, + "root-id": root_id, + "closure": remote_closure, + } + + # Produce canonical bytes + manifest_bytes = json.dumps(manifest_dict, separators=(",", ":"), sort_keys=True).encode("utf-8") + + return manifest_dict, manifest_bytes + + def _validate_targets(self, targets: dict[str, list[str]]) -> dict[str, list[str]]: + if not isinstance(targets, dict): + raise ValueError("Invalid targets: expected {'dag': [...]} mapping") + if 
set(targets) != {"dag"}: + raise ValueError("Invalid targets: expected only the 'dag' namespace") + dag_ids = targets["dag"] + if not isinstance(dag_ids, list): + raise ValueError("Invalid targets: dag targets must be a sorted unique list of 64 lowercase hex ids") + validated = [self._validate_dag_id(dag_id) for dag_id in dag_ids] + if validated != sorted(validated) or len(validated) != len(set(validated)): + raise ValueError("Invalid targets: dag targets must be a sorted unique list of 64 lowercase hex ids") + return {"dag": validated} + + def _collect_direct_dag_ids_from_obj( + self, + obj: Any, + *, + root_ref: Ref, + to_visit: list[Ref], + visited: set[Ref], + dag_ids: set[str], + ) -> None: + if isinstance(obj, Ref): + if obj.ns() == "dag" and obj != root_ref: + dag_ids.add(obj.id()) + return + if obj not in visited: + to_visit.append(obj) + return + if isinstance(obj, dict): + for value in obj.values(): + self._collect_direct_dag_ids_from_obj( + value, root_ref=root_ref, to_visit=to_visit, visited=visited, dag_ids=dag_ids + ) + return + if isinstance(obj, list): + for value in obj: + self._collect_direct_dag_ids_from_obj( + value, root_ref=root_ref, to_visit=to_visit, visited=visited, dag_ids=dag_ids + ) + return + if is_dataclass(obj): + for field_def in fields(obj): + self._collect_direct_dag_ids_from_obj( + getattr(obj, field_def.name), root_ref=root_ref, to_visit=to_visit, visited=visited, dag_ids=dag_ids + ) + + def _direct_dag_ids(self, txn, root_ref: Ref) -> list[str]: + if root_ref.ns() == "commit": + commit: Commit = txn.get(root_ref) + tree: Tree = txn.get(commit.tree) + return sorted({dag_ref.id() for dag_ref in tree.dags.values()}) + + dag_ids: set[str] = set() + visited: set[Ref] = set() + to_visit: list[Ref] = [root_ref] + + while to_visit: + ref = to_visit.pop() + if ref in visited: + continue + visited.add(ref) + obj = txn.get(ref) + self._collect_direct_dag_ids_from_obj( + obj, root_ref=root_ref, to_visit=to_visit, visited=visited, 
dag_ids=dag_ids + ) + + return sorted(dag_ids) + + def _targets_for_root(self, txn, root_ref: Ref) -> dict[str, list[str]]: + return {"dag": self._direct_dag_ids(txn, root_ref)} + + def _require_manifest_ref_targets(self, ref_obj: dict, ref_path: str) -> dict[str, list[str]]: + targets = ref_obj.get("targets") + if targets is None: + raise InvalidRef(f"Invalid ref: manifest ref {ref_path} must include targets") + return self._validate_targets(targets) + + def _put_ref_manifest_from_local_manifest(self, local_manifest: dict, root_ref: Ref, txn) -> str: + direct_dag_ids = self._direct_dag_ids(txn, root_ref) + for dag_id in direct_dag_ids: + self._ensure_dag_ref_in_txn(Ref(f"dag:{dag_id}"), txn, ()) + + self._push_upload_objects(local_manifest) + _manifest_dict, manifest_bytes = self._build_remote_manifest( + local_manifest, require_commit_root=False, direct_dag_ids=direct_dag_ids + ) + manifest_id = hashlib.sha256(manifest_bytes).hexdigest() + if not self._remote_has_cas(manifest_id): + self._remote_put_cas(manifest_id, manifest_bytes) + # If the root ref is a dag, also write its dag ref pointer file so that + # _load_remote_dag can resolve it by dag_id. 
def _ensure_dag_ref_in_txn(self, dag_ref: Ref, txn, stack: tuple[str, ...]) -> bool:
    """Make sure a remote dag ref exists for ``dag_ref``, publishing it if missing.

    If the remote already has a ref for this dag id nothing is uploaded.
    Otherwise the dag's direct child dags are ensured first (recursively),
    then the dag's own objects and manifest are uploaded and the dag ref is
    written.

    Parameters
    ----------
    dag_ref : Ref
        Local dag reference
    txn
        Open local transaction used to read the dag and its closure
    stack : tuple[str, ...]
        Dag ids currently being processed by enclosing calls; used to
        detect cycles

    Returns
    -------
    bool
        Always True

    Raises
    ------
    DmlRepoError
        If dag_ref's id is already on ``stack`` (cycle)
    ValueError
        If the dag id is malformed or the local manifest is not dag-rooted
    """
    dag_id = self._validate_dag_id(dag_ref.id())
    if dag_id in stack:
        cycle = " -> ".join([*stack, dag_id])
        raise DmlRepoError(f"Cycle detected in DAG closure: {cycle}")

    try:
        self._remote_get_dag_ref(dag_id)
        return True
    except RemoteError:
        # Not published yet; fall through and upload it.
        pass

    local_manifest = self._local_dump_dict(txn, dag_ref)
    if local_manifest.get("root-ns") != "dag":
        raise ValueError(f"Expected local dag manifest root namespace 'dag', got {local_manifest.get('root-ns')!r}")

    # Walk the dag once and reuse the result for both the recursion and the
    # manifest. NOTE(review): assumes the recursive uploads below do not
    # change what txn returns for this dag -- confirm.
    child_dag_ids = self._direct_dag_ids(txn, dag_ref)
    next_stack = (*stack, dag_id)
    for child_dag_id in child_dag_ids:
        self._ensure_dag_ref_in_txn(Ref(f"dag:{child_dag_id}"), txn, next_stack)

    self._push_upload_objects(local_manifest)
    _manifest_dict, manifest_bytes = self._build_remote_manifest(
        local_manifest, require_commit_root=False, direct_dag_ids=child_dag_ids
    )
    manifest_oid = hashlib.sha256(manifest_bytes).hexdigest()
    if not self._remote_has_cas(manifest_oid):
        self._remote_put_cas(manifest_oid, manifest_bytes)

    ref_obj = {
        "kind": "ref",
        "schema": 0,
        "target": manifest_oid,
        "created_at": int(time.time()),
        "meta": {"dag": {"id": dag_id}},
    }
    ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8")
    try:
        self._remote_put_dag_ref(dag_id, ref_bytes)
    except RefAlreadyExists:
        # Another writer published first; only check that what is there
        # decodes as a valid ref.
        self._decode_ref(self._remote_get_dag_ref(dag_id))
    return True
str: + with self._tx(readonly=False) as txn: + local_manifest = self._local_dump_dict(txn, root_ref) + return self._put_ref_manifest_from_local_manifest(local_manifest, root_ref, txn) + + @_remote_boundary("manifest load") + def load_ptr(self, manifest_oid: str, *, expected_root_ns: str | None = None) -> Ref: + """Resolve a manifest OID, materialize closure locally, and return root ref.""" + with self._tx(readonly=False) as txn: + return self.load_ptr_in_txn(manifest_oid, txn, expected_root_ns=expected_root_ns) + + def _fetch_manifest_result(self, manifest_oid: str) -> _ManifestFetchResult: + manifest_oid = self._validate_manifest_oid(manifest_oid) + manifest_bytes = self._remote_get_cas(manifest_oid) + return _ManifestFetchResult(manifest_oid, self._decode_manifest(manifest_bytes)) + + def _fetch_dag_ref_result(self, dag_id: str) -> _DagRefFetchResult: + dag_ref = self._decode_ref(self._remote_get_dag_ref(dag_id)) + return _DagRefFetchResult(dag_id, dag_ref["target"]) + + def _fetch_cas_result(self, ns: str, oid: str) -> _CasFetchResult: + raw_bytes = self._remote_get_cas(oid) + computed_hash = hashlib.sha256(raw_bytes).hexdigest() + if computed_hash != oid: + raise ShaMismatch(f"SHA256 mismatch for object {oid}: expected {oid}, got {computed_hash}") + return _CasFetchResult(ns, oid, raw_bytes) + + def _put_local_cas_object(self, txn, ns: str, oid: str, raw_bytes: bytes) -> Ref: + dump_str = base64.b64encode(raw_bytes).decode("ascii") + return txn.txn.put(dump_str, ns=ns, raw=True) + + def load_ptr_in_txn(self, manifest_oid: str, txn, *, expected_root_ns: str | None = None) -> Ref: + """Resolve a manifest OID and materialize closure using a provided transaction.""" + seen_manifests: set[str] = set() + seen_dag_refs: set[str] = set() + seen_objects: set[tuple[str, str]] = set() + pending = set() + root_ref: Ref | None = None + + def submit_manifest(pool, next_manifest_oid: str) -> None: + next_manifest_oid = self._validate_manifest_oid(next_manifest_oid) + if 
next_manifest_oid in seen_manifests: + return + seen_manifests.add(next_manifest_oid) + pending.add(pool.submit(self._fetch_manifest_result, next_manifest_oid)) + + def submit_dag_ref(pool, dag_id: str) -> None: + if dag_id in seen_dag_refs: + return + seen_dag_refs.add(dag_id) + pending.add(pool.submit(self._fetch_dag_ref_result, dag_id)) + + def submit_object(pool, ns: str, oid: str) -> None: + key = (ns, oid) + if key in seen_objects: + return + if self._local_has(txn, ns, oid): + seen_objects.add(key) + return + seen_objects.add(key) + pending.add(pool.submit(self._fetch_cas_result, ns, oid)) + + with ThreadPoolExecutor(max_workers=self.fetch_workers) as pool: + submit_manifest(pool, manifest_oid) + + while pending: + done, pending = wait(pending, return_when=FIRST_COMPLETED) + for fut in done: + result = fut.result() + + if isinstance(result, _ManifestFetchResult): + manifest = result.manifest + current_root_ref = Ref(f"{manifest['root-ns']}:{manifest['root-id']}") + if root_ref is None: + root_ref = current_root_ref + if expected_root_ns is not None and root_ref.ns() != expected_root_ns: + raise ValueError( + "Manifest root namespace mismatch: " + f"expected {expected_root_ns!r}, got {root_ref.ns()!r}" + ) + if manifest["root-ns"] == "dag": + submit_object(pool, "dag", manifest["root-id"]) + for ns, ids in manifest["closure"].items(): + if ns == "dag": + for dag_id in ids: + submit_dag_ref(pool, dag_id) + else: + for oid in ids: + submit_object(pool, ns, oid) + continue + + if isinstance(result, _DagRefFetchResult): + submit_manifest(pool, result.manifest_oid) + continue + + if isinstance(result, _CasFetchResult): + inserted_ref = self._put_local_cas_object(txn, result.ns, result.oid, result.raw_bytes) + if inserted_ref.ns() != result.ns or inserted_ref.id() != result.oid: + raise DmlRepoError( + f"Loaded object mismatch: expected {result.ns}:{result.oid}, got {inserted_ref}" + ) + continue + + raise AssertionError(f"Unhandled remote load result: 
{type(result)!r}") + + if root_ref is None: + raise DmlRepoError("Remote manifest load produced no root") + if not txn.exists(root_ref): + raise DmlRepoError(f"Remote manifest load did not materialize root object: {root_ref}") + return root_ref + + @_remote_boundary("cache get") + def get_cache_ref(self, cache_key: str) -> str | None: + """Read cache ref target manifest OID for a cache key.""" + ref_path = self._cache_ref_path(cache_key) + try: + ref_bytes = self._remote_get_ref(ref_path) + except RemoteError: + return None + ref_obj = self._decode_ref(ref_bytes) + self._require_manifest_ref_targets(ref_obj, ref_path) + return ref_obj["target"] + + @_remote_boundary("cache read") + def get_cache_ref_info(self, cache_key: str) -> dict[str, Any] | None: + ref_path = self._cache_ref_path(cache_key) + try: + ref_bytes = self._remote_get_ref(ref_path) + except RemoteError: + return None + ref_obj = self._decode_ref(ref_bytes) + self._require_manifest_ref_targets(ref_obj, ref_path) + execution_id = ref_obj.get("execution_id") + if not isinstance(execution_id, str) or not execution_id: + raise DmlRepoError(f"Cache ref missing execution_id: {ref_path}") + ref_obj["execution_id"] = execution_id + return ref_obj + + @_remote_boundary("cache put") + def put_cache_ref( + self, + cache_key: str, + target: str, + *, + targets: dict[str, list[str]], + execution_id: str, + ) -> None: + """Create a cache ref only when no current ref exists.""" + target = self._validate_manifest_oid(target) + targets = self._validate_targets(targets) + if not isinstance(execution_id, str) or not execution_id: + raise ValueError("execution_id must be a non-empty string") + ref_path = self._cache_ref_path(cache_key) + ref_obj = { + "kind": "ref", + "schema": 0, + "target": target, + "execution_id": execution_id, + "created_at": int(time.time()), + "targets": targets, + "meta": {}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + 
@_remote_boundary("cache list")
def list_cache_refs(self, limit: int | None = None) -> list[tuple[str, str]]:
    """List cache refs as ``(cache_key, target_oid)`` pairs.

    Parameters
    ----------
    limit : int | None, optional
        Maximum number of pairs to return. ``None`` means no limit; zero or a
        negative value yields an empty list.

    Returns
    -------
    list[tuple[str, str]]
        Pairs in the order produced by ``self.list("cache")``. The cache key
        is the ref file name without its ``.json`` suffix.
    """
    # The limit used to be checked only after the first append, so limit=0
    # (or a negative limit) still returned one entry.
    if limit is not None and limit <= 0:
        return []

    suffix = ".json"
    out: list[tuple[str, str]] = []
    for ref_obj in self.list("cache"):
        filename = ref_obj["ref_path"].rsplit("/", 1)[-1]
        if not filename.endswith(suffix):
            continue
        out.append((filename[: -len(suffix)], ref_obj["target"]))
        if limit is not None and len(out) >= limit:
            break
    return out
continue + body, _etag = self._get_json_key_with_etag(key) + if body is not None: + out.append((key, body)) + return out + + def _ingest_execution_graph(self) -> sqlite3.Connection: + conn = sqlite3.connect(":memory:") + conn.executescript( + """ + create table states ( + execution_id text primary key, + cache_key text not null, + lifecycle text not null, + updated_at integer not null, + cancellation_requested_by text, + spawned_execution_ids_json text not null + ); + create table edges ( + callee_execution_id text not null, + caller_execution_id text not null, + primary key (callee_execution_id, caller_execution_id) + ); + create table invalidations ( + execution_id text primary key, + cache_key text not null, + requested_by text not null, + requested_at integer not null + ); + create table cache_refs ( + cache_key text primary key, + execution_id text not null, + target text not null, + created_at integer not null + ); + """ + ) + for _key, state in self._list_json_prefix("exec/state/"): + conn.execute( + "insert or replace into states values (?, ?, ?, ?, ?, ?)", + ( + state["execution_id"], + state["cache_key"], + state["lifecycle"], + state["updated_at"], + state.get("cancellation_requested_by"), + json.dumps(state.get("spawned_execution_ids", []), separators=(",", ":"), sort_keys=True), + ), + ) + for _key, edge in self._list_json_prefix("exec/edges/"): + conn.execute( + "insert or replace into edges values (?, ?)", + (edge["callee_execution_id"], edge["caller_execution_id"]), + ) + for _key, invalidation in self._list_json_prefix("exec/invalidate/"): + conn.execute( + "insert or replace into invalidations values (?, ?, ?, ?)", + ( + invalidation["execution_id"], + invalidation["cache_key"], + invalidation["requested_by"], + invalidation["requested_at"], + ), + ) + for ref in self.list("cache"): + ref_path = ref["ref_path"] + cache_key = ref_path.split("/")[-1][: -len(".json")] + conn.execute( + "insert or replace into cache_refs values (?, ?, ?, ?)", + 
(cache_key, ref.get("execution_id"), ref["target"], ref["created_at"]), + ) + conn.commit() + return conn + + def _update_execution_state_record( + self, + execution_id: str, + *, + lifecycle: str | None = None, + spawned_execution_ids: list[str] | None = None, + cancellation_requested_by: str | None = None, + retries: int = 8, + ) -> dict[str, Any] | None: + key = self._execution_state_key(execution_id) + lifecycle_rank = {"running": 0, "cancel-pending": 1, "cancel-detached": 2, "succeeded": 3, "failed": 3} + for _ in range(retries): + current, etag = self._get_json_key_with_etag(key) + if current is None or etag is None: + return None + merged = dict(current) + if lifecycle is not None and lifecycle_rank[lifecycle] > lifecycle_rank[merged["lifecycle"]]: + merged["lifecycle"] = lifecycle + if spawned_execution_ids: + merged["spawned_execution_ids"] = sorted( + {*merged.get("spawned_execution_ids", []), *spawned_execution_ids} + ) + if cancellation_requested_by is not None and merged.get("cancellation_requested_by") is None: + merged["cancellation_requested_by"] = cancellation_requested_by + merged["updated_at"] = int(time.time()) + if self._put_json_key_if_match(key, merged, etag): + return merged + raise DmlRepoError(f"Remote execution state update failed for {execution_id}") + + @_remote_boundary("invalidate cache") + def invalidate_cache(self, cache_keys: list[str], *, requested_by: str) -> dict[str, Any]: + conn = self._ingest_execution_graph() + seen: list[str] = [] + seen_set: set[str] = set() + unseen: set[str] = set() + for cache_key in cache_keys: + row = conn.execute( + "select execution_id from cache_refs where cache_key = ?", + (cache_key,), + ).fetchone() + if row and row[0]: + unseen.add(cast(str, row[0])) + while unseen: + exec_id = unseen.pop() + state_row = conn.execute( + "select cache_key from states where execution_id = ?", + (exec_id,), + ).fetchone() + if state_row is None: + continue + cache_row = conn.execute( + "select execution_id from 
cache_refs where cache_key = ?", + (state_row[0],), + ).fetchone() + if cache_row is None or cache_row[0] != exec_id: + continue + if exec_id in seen_set: + continue + seen.append(exec_id) + seen_set.add(exec_id) + for caller_row in conn.execute( + "select caller_execution_id from edges where callee_execution_id = ?", + (exec_id,), + ): + if caller_row[0] not in seen_set: + unseen.add(caller_row[0]) + requested_at = int(time.time()) + committed: list[str] = [] + for exec_id in reversed(seen): + row = conn.execute( + "select cache_key from states where execution_id = ?", + (exec_id,), + ).fetchone() + if row is None: + continue + tombstone = { + "execution_id": exec_id, + "cache_key": row[0], + "requested_by": requested_by, + "requested_at": requested_at, + } + self._put_json_key_if_absent(self._execution_invalidate_key(exec_id), tombstone) + self.delete_cache_ref_if_execution_id(cast(str, row[0]), exec_id) + committed.append(exec_id) + return {"requested": cache_keys, "invalidated_execution_ids": committed} + + @_remote_boundary("cancel executions") + def cancel_executions(self, execution_ids: list[str], *, requested_by: str) -> dict[str, Any]: + conn = self._ingest_execution_graph() + seen: list[str] = [] + seen_set: set[str] = set() + unseen = set(execution_ids) + terminal = {"succeeded", "failed", "cancel-detached"} + inactive = terminal | {"cancel-pending"} + while unseen: + exec_id = unseen.pop() + row = conn.execute( + "select lifecycle, spawned_execution_ids_json from states where execution_id = ?", + (exec_id,), + ).fetchone() + if row is None or row[0] in terminal or exec_id in seen_set: + continue + seen.append(exec_id) + seen_set.add(exec_id) + unseen.update(dep for dep in json.loads(row[1]) if dep not in seen_set) + committed: list[str] = [] + for exec_id in reversed(seen): + row = conn.execute( + "select lifecycle from states where execution_id = ?", + (exec_id,), + ).fetchone() + if row is None or row[0] in terminal: + continue + caller_count = 0 + 
for caller_row in conn.execute( + "select caller_execution_id from edges where callee_execution_id = ?", + (exec_id,), + ): + caller_state = conn.execute( + "select lifecycle from states where execution_id = ?", + (caller_row[0],), + ).fetchone() + if caller_state is not None and caller_state[0] not in inactive: + caller_count += 1 + if caller_count > 1: + continue + updated = self._update_execution_state_record( + exec_id, + lifecycle="cancel-pending", + cancellation_requested_by=requested_by, + ) + if updated is not None: + conn.execute( + "update states set lifecycle = ?, cancellation_requested_by = ? where execution_id = ?", + (updated["lifecycle"], updated.get("cancellation_requested_by"), exec_id), + ) + committed.append(exec_id) + conn.commit() + return {"requested": execution_ids, "cancel_pending_execution_ids": committed} + + def _push_upload_objects(self, local_manifest: dict) -> None: + """Upload missing CAS objects from local manifest closure. + + Iterates through all objects in the local manifest's closure, + verifies SHA256 integrity, and uploads to remote storage if missing. + + Parameters + ---------- + local_manifest : dict + Local manifest dictionary with closure containing base64-encoded objects + + Raises + ------ + ValueError + If any object's SHA256 hash doesn't match its ID + """ + closure = local_manifest.get("closure", {}) + + for _ns, items in closure.items(): + for id_, dump_str in items.items(): + # Decode base64 to raw bytes + raw = base64.b64decode(dump_str) + + # Verify SHA256 matches the ID + computed_hash = hashlib.sha256(raw).hexdigest() + if computed_hash != id_: + raise ShaMismatch(f"SHA256 mismatch for object {id_}: expected {id_}, got {computed_hash}") + + # Upload only if missing + if not self._remote_has_cas(id_): + self._remote_put_cas(id_, raw) + + @_remote_boundary("push") + def push(self, branch: str) -> str: + """Push a branch to remote storage. + + Parameters + ---------- + branch : str + Branch name to publish. 
+ + Returns + ------- + str + The ref path where the reference was published + + Raises + ------ + RefAlreadyExists + If the ref already exists remotely + """ + root_ref, ref_path = self._resolve_branch_push_target(branch) + + with self._tx(readonly=False) as txn: + lm = self._local_dump_dict(txn, root_ref) + targets = self._targets_for_root(txn, root_ref) + manifest_dict, _manifest_bytes = self._build_remote_manifest( + lm, require_commit_root=True, direct_dag_ids=targets["dag"] + ) + expected_targets = {"dag": sorted(set(manifest_dict["closure"].get("dag", [])))} + if targets != expected_targets: + raise ValueError(f"Manifest targets mismatch: expected {expected_targets}, got {targets}") + manifest_id = self._put_ref_manifest_from_local_manifest(lm, root_ref, txn) + + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, + "created_at": int(time.time()), + "targets": targets, + "meta": {}, # Optional metadata + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + self._remote_put_ref(ref_path, ref_bytes) + return ref_path + + @_remote_boundary("pull") + def pull(self, ref_path: str) -> None: + """Pull a repository reference from remote storage. 
@_remote_boundary("push branch")
def push_project_branch(self, uri: str, branch: str, *, create: bool = False, force: bool = False) -> str:
    """Publish a local branch to a project branch ref on the remote.

    Parameters
    ----------
    uri : str
        DML URI that must select a branch.
    branch : str
        Local branch whose push target is resolved and published.
    create : bool, optional
        Allow creating the remote ref when reading it raises ``RemoteError``.
    force : bool, optional
        Skip the fast-forward check against the currently published commit.

    Returns
    -------
    str
        Whatever ``put_project_branch_ref`` returns.

    Raises
    ------
    ValueError
        If the URI has no branch, or the computed targets disagree with the
        manifest closure.
    DmlRepoError
        If the remote ref cannot be read and ``create`` is false, or the push
        is not a fast-forward and ``force`` is false.
    """
    parsed = self.parse_dml_uri(uri, require_identifier=True)
    if parsed.branch is None:
        raise ValueError("Project branch push requires a branch URI")
    ref_path = self._project_branch_ref_path(parsed.owner, parsed.project, parsed.branch)
    root_ref, _ref_path = self._resolve_branch_push_target(branch)

    observed: RemoteRefRead | None = None
    try:
        observed = self._remote_get_ref_with_etag(ref_path)
    except RemoteError:
        if not create:
            raise DmlRepoError(
                f"Remote branch ref '{ref_path}' does not exist; push updates existing refs only. "
                "Use --create to create a new remote branch ref."
            ) from None

    # The fast-forward check materializes the published commit locally via
    # load_ptr_in_txn, which stores fetched objects through the transaction.
    # pull() and fetch_uri() open a writable transaction for that same call,
    # so do likewise whenever the check will run; otherwise stay read-only.
    check_fast_forward = observed is not None and not force

    with self._tx(readonly=not check_fast_forward) as txn:
        lm = self._local_dump_dict(txn, root_ref)
        targets = self._targets_for_root(txn, root_ref)
        manifest_dict, _manifest_bytes = self._build_remote_manifest(
            lm, require_commit_root=True, direct_dag_ids=targets["dag"]
        )
        expected_targets = {"dag": sorted(set(manifest_dict["closure"].get("dag", [])))}
        if targets != expected_targets:
            raise ValueError(f"Manifest targets mismatch: expected {expected_targets}, got {targets}")
        if check_fast_forward:
            remote_commit = self.load_ptr_in_txn(observed.ref["target"], txn, expected_root_ns="commit")
            from daggerml._internal.ops.commit import CommitOps

            if not CommitOps(_db=self._db)._is_ancestor_in_txn(remote_commit, root_ref, txn):
                raise DmlRepoError("Non-fast-forward push rejected; use --force to override")
        manifest_id = self._put_ref_manifest_from_local_manifest(lm, root_ref, txn)

    # No observed ref means create; otherwise update guarded by the observed ETag.
    return self.put_project_branch_ref(
        parsed.owner,
        parsed.project,
        parsed.branch,
        manifest_id,
        targets=targets,
        etag=None if observed is None else observed.etag,
        create=observed is None,
    )
@_remote_boundary("pull branch")
def pull_uri_into_branch(self, uri: str, branch: str, *, user: str) -> Ref:
    """Fetch ``uri`` and merge the fetched ref into the local ``branch``.

    Parameters
    ----------
    uri : str
        DML URI passed to ``fetch_uri``.
    branch : str
        Local branch handed to ``CommitOps.merge_into_head``.
    user : str
        User handed to ``CommitOps.merge_into_head``.

    Returns
    -------
    Ref
        The value returned by ``merge_into_head``.
    """
    fetched_root = self.fetch_uri(uri)

    # Imported at call time, as in the rest of this class.
    from daggerml._internal.ops.commit import CommitOps

    commit_ops = CommitOps(_db=self._db)
    return commit_ops.merge_into_head(branch, fetched_root, user)
@_remote_boundary("prune")
def prune(self) -> int:
    """Delete invoke transport blobs that have reached the prune age.

    Objects under ``io/invoke/`` (below ``self.prefix`` when one is set) are
    listed page by page. An object is deleted once its age in whole seconds
    is at least ``self._IO_INVOKE_PRUNE_AGE_SECONDS``.

    Returns
    -------
    int
        Number of objects deleted.
    """
    started = int(time.time())
    scan_prefix = "io/invoke/"
    if self.prefix:
        scan_prefix = f"{self.prefix}/io/invoke/"

    removed = 0
    pages = self.client.get_paginator("list_objects_v2").paginate(Bucket=self.bucket, Prefix=scan_prefix)
    for page in pages:
        if "Contents" not in page:
            continue
        for entry in page["Contents"]:
            blob_key = entry["Key"]
            modified = entry["LastModified"]
            age = started - int(modified.timestamp())
            if age >= self._IO_INVOKE_PRUNE_AGE_SECONDS:
                self.client.delete_object(Bucket=self.bucket, Key=blob_key)
                removed += 1
    return removed
+ + Returns + ------- + set[str] + Set of all live OIDs (manifest targets + closure union) + """ + if malformed not in {"raise", "warn", "ignore"}: + raise ValueError(f"Invalid malformed policy: {malformed!r}") + + live_oids = set() + worklist = [] + seen_manifests = set() + + def _malformed_detail(exc: Exception) -> str: + msg = str(exc) + for prefix in ("Invalid ref: ", "Invalid manifest: "): + if msg.startswith(prefix): + return msg[len(prefix) :] + return msg + + def _handle_malformed(message: str, *, delete_ref_path: str | None = None, delete_cas_oid: str | None = None): + if malformed == "raise": + raise DmlRepoError(message) + if malformed == "warn": + self._logger.warning(message) + if delete_ref_path is not None: + _safe_delete_ref(delete_ref_path) + if delete_cas_oid is not None: + _safe_delete_cas(delete_cas_oid) + + def _safe_delete_ref(ref_path: str): + try: + self._remote_delete_ref(ref_path) + except Exception: + pass + + def _safe_delete_cas(oid: str): + try: + self.client.delete_object(Bucket=self.bucket, Key=self._cas_key(oid)) + except Exception: + pass + + def _visit_root_ref(ref_obj: dict): + manifest_oid = ref_obj["target"] + live_oids.add(manifest_oid) + worklist.append(manifest_oid) + + for prefix in ("tags", "cache", "projects"): + prefix_key = f"{self.prefix}/refs/{prefix}/" if self.prefix else f"refs/{prefix}/" + paginator = self.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix_key): + if "Contents" not in page: + continue + for obj in page["Contents"]: + key = obj["Key"] + if not key.endswith(".json"): + continue + ref_path = key[len(f"{self.prefix}/refs/") :] if self.prefix else key[len("refs/") :] + try: + ref_obj = self._decode_ref(self._remote_get_ref(ref_path)) + self._require_manifest_ref_targets(ref_obj, ref_path) + _visit_root_ref(ref_obj) + except InvalidRef as exc: + _handle_malformed( + f"Malformed ref refs/{ref_path}: {_malformed_detail(exc)}", delete_ref_path=ref_path 
+ ) + except MissingCasObject: + _safe_delete_ref(ref_path) + + while worklist: + manifest_oid = worklist.pop() + if manifest_oid in seen_manifests: + continue + seen_manifests.add(manifest_oid) + try: + manifest = self._decode_manifest(self._remote_get_cas(manifest_oid)) + except InvalidManifest as exc: + _handle_malformed( + f"Malformed manifest {manifest_oid}: {_malformed_detail(exc)}", delete_cas_oid=manifest_oid + ) + continue + except MissingCasObject: + continue + + if manifest.get("root-ns") == "dag": + live_oids.add(manifest["root-id"]) + + for ns, ids in manifest["closure"].items(): + if ns == "dag": + for dag_id in ids: + try: + dag_ref = self._decode_ref(self._remote_get_dag_ref(dag_id)) + except RemoteError: + continue + except InvalidRef as exc: + _handle_malformed( + f"Malformed ref refs/{self._dag_ref_path(dag_id)}: {_malformed_detail(exc)}", + delete_ref_path=self._dag_ref_path(dag_id), + ) + continue + child_manifest_oid = dag_ref["target"] + live_oids.add(child_manifest_oid) + try: + self._remote_get_cas(child_manifest_oid) + except MissingCasObject: + _safe_delete_ref(self._dag_ref_path(dag_id)) + continue + worklist.append(child_manifest_oid) + continue + for oid in ids: + live_oids.add(oid) + try: + raw = self._remote_get_cas(oid) + except MissingCasObject: + continue + if hashlib.sha256(raw).hexdigest() != oid: + _handle_malformed( + f"Malformed CAS {oid}: sha256 mismatch for stored bytes", + delete_cas_oid=oid, + ) + + return live_oids + + def _gc_sweep(self, live_oids: set[str], min_age_seconds: int) -> dict[str, int]: + """Perform GC sweep phase: delete unreferenced CAS objects older than safety window. 
+ + Parameters + ---------- + live_oids : set[str] + Set of live OIDs that should not be deleted + min_age_seconds : int + Minimum age in seconds for objects to be eligible for deletion + + Returns + ------- + dict[str, int] + Summary with counts: {"deleted": n, "kept_live": n, "kept_young": n} + """ + deleted = 0 + kept_live = 0 + kept_young = 0 + + # Current time for age calculation + now = int(time.time()) + + # List all CAS objects under cas/sha256/ + cas_prefix = f"{self.prefix}/cas/sha256/" if self.prefix else "cas/sha256/" + + paginator = self.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=self.bucket, Prefix=cas_prefix): + if "Contents" not in page: + continue + + for obj in page["Contents"]: + key = obj["Key"] + last_modified = obj["LastModified"] + + # Extract OID from key: {prefix}/cas/sha256/{aa}/{bb}/{oid} + # The OID is the last component after the last '/' + oid = key.split("/")[-1] + + # Validate that this looks like an OID (64 hex chars) + if not re.match(r"^[0-9a-f]{64}$", oid): + raise InvalidOid(f"Invalid CAS key: expected trailing 64-char lowercase hex OID, got {key!r}") + + if oid in live_oids: + # Keep live objects + kept_live += 1 + continue + + # Check age + age_seconds = now - int(last_modified.timestamp()) + if age_seconds < min_age_seconds: + # Keep young objects + kept_young += 1 + continue + + # Delete old, unreferenced object + try: + self.client.delete_object(Bucket=self.bucket, Key=key) + deleted += 1 + except Exception: + # Skip deletion errors (object might have been deleted by another process) + pass + + return { + "deleted": deleted, + "kept_live": kept_live, + "kept_young": kept_young, + } + + @_remote_boundary("gc") + def gc( + self, min_age_seconds: int = 24 * 3600, *, malformed: Literal["raise", "warn", "ignore"] = "warn" + ) -> dict[str, int]: + """Run garbage collection on the remote storage. + + This performs mark-and-sweep GC where refs are the roots. 
from __future__ import annotations

import re
from dataclasses import dataclass
from urllib.parse import urlsplit

# One owner / project / ref-name segment: 1-128 characters drawn from lowercase
# ASCII letters, digits, ".", "_" and "-", starting with a letter or digit.
_SEGMENT_RE = re.compile(r"^[a-z0-9][a-z0-9._-]{0,127}$")


def validate_segment(label: str, value: str) -> str:
    """Return ``value`` unchanged if it is a valid segment.

    Parameters
    ----------
    label : str
        Human-readable name of the field, used in the error message.
    value : str
        Candidate segment.

    Raises
    ------
    ValueError
        If ``value`` is not a string or does not match the segment pattern.
    """
    # fullmatch() rather than match(): with match(), the trailing "$" also
    # matches just before a final newline, so "main\n" used to be accepted.
    if not isinstance(value, str) or not _SEGMENT_RE.fullmatch(value):
        raise ValueError(f"Invalid {label}: {value!r}")
    return value


def validate_ref_name(label: str, value: str) -> str:
    """Return ``value`` unchanged if it is a valid ``/``-separated ref name.

    Every ``/``-separated part must itself be a valid segment; empty parts,
    ``.``, ``..`` and backslashes are rejected.

    Raises
    ------
    ValueError
        If ``value`` is empty, not a string, or any part is invalid.
    """
    if not isinstance(value, str) or not value:
        raise ValueError(f"Invalid {label}: must be a non-empty string")
    if value in {".", ".."} or "\\" in value:
        raise ValueError(f"Invalid {label}: {value!r}")
    parts = value.split("/")
    if any(part in {"", ".", ".."} for part in parts):
        raise ValueError(f"Invalid {label}: {value!r}")
    for part in parts:
        validate_segment(f"{label} segment", part)
    return value


@dataclass(frozen=True)
class RevisionUri:
    """Parsed ``dml://owner/project`` URI with exactly one selector.

    Exactly one of ``branch`` and ``tag`` must be set; all fields are
    validated on construction and ``ValueError`` is raised otherwise.
    """

    owner: str
    project: str
    branch: str | None = None
    tag: str | None = None

    def __post_init__(self) -> None:
        validate_segment("project owner", self.owner)
        validate_segment("project name", self.project)
        if (self.branch is None) == (self.tag is None):
            raise ValueError("Revision URI must include exactly one selector (branch xor tag)")
        if self.branch is not None:
            validate_ref_name("branch", self.branch)
        if self.tag is not None:
            validate_ref_name("tag", self.tag)


def stringify_revision_uri(uri: RevisionUri) -> str:
    """Render ``uri`` as ``dml://owner/project#branch`` or ``...@tag``."""
    base = f"dml://{uri.owner}/{uri.project}"
    if uri.branch is not None:
        return f"{base}#{uri.branch}"
    return f"{base}@{uri.tag}"


def parse_revision_uri(
    uri: str,
    *,
    default_branch: str | None = None,
    require_identifier: bool = False,
) -> RevisionUri:
    """Parse a ``dml://`` URI into a :class:`RevisionUri`.

    Parameters
    ----------
    uri : str
        Text of the form ``dml://owner/project``, optionally followed by
        ``#branch`` or ``@tag`` (never both).
    default_branch : str | None, optional
        Branch to use when the URI carries no selector.
    require_identifier : bool, optional
        Only affects which error message is raised when the URI has no
        selector and ``default_branch`` is ``None``.

    Raises
    ------
    ValueError
        If the URI is malformed, carries both selectors, carries none and no
        ``default_branch`` is given, or any component fails validation.
    """
    if not isinstance(uri, str) or not uri.startswith("dml://"):
        raise ValueError(f"Invalid DML URI: {uri!r}")
    if "#" in uri and "@" in uri:
        raise ValueError(f"Invalid DML URI: cannot include both branch and tag: {uri!r}")

    base = uri
    branch: str | None = None
    tag: str | None = None
    if "#" in uri:
        base, branch = uri.split("#", 1)
    elif "@" in uri:
        base, tag = uri.split("@", 1)

    parsed = urlsplit(base)
    if parsed.scheme != "dml" or not parsed.netloc or parsed.query or parsed.fragment:
        raise ValueError(f"Invalid DML URI: {uri!r}")
    project = parsed.path.strip("/")
    if "/" in project or not project:
        raise ValueError(f"Invalid DML URI project path: {uri!r}")

    if branch is None and tag is None:
        if default_branch is not None:
            branch = validate_ref_name("branch", default_branch)
        elif require_identifier:
            raise ValueError(f"DML URI must include a branch or tag: {uri!r}")
        else:
            raise ValueError("Revision URI parser requires default_branch when selector is omitted")

    return RevisionUri(
        owner=validate_segment("project owner", parsed.netloc),
        project=validate_segment("project name", project),
        branch=validate_ref_name("branch", branch) if branch is not None else None,
        tag=validate_ref_name("tag", tag) if tag is not None else None,
    )


def canonicalize_revision_uri(
    uri: str,
    *,
    default_branch: str | None = None,
    require_identifier: bool = False,
) -> str:
    """Parse ``uri`` and return its canonical string form.

    Raises
    ------
    ValueError
        If parsing fails, or the canonical form is longer than 64 bytes when
        encoded as UTF-8.
    """
    canonical = stringify_revision_uri(
        parse_revision_uri(uri, default_branch=default_branch, require_identifier=require_identifier)
    )
    if len(canonical.encode("utf-8")) > 64:
        raise ValueError("Canonical DML URI exceeds 64-byte ref limit")
    return canonical
def require_ref(
    ref: Any,
    expected_ns: Optional[list[str]] = None,
    context: Optional[str] = None,
) -> None:
    """Check that ``ref`` is a ``Ref``, optionally within a namespace hierarchy.

    Parameters
    ----------
    ref : Any
        Value to check.
    expected_ns : list[str] | None
        Required leading elements of ``ref.nss()``. When ``None`` only the
        type of ``ref`` is checked.
    context : str | None
        Text prepended (followed by ``": "``) to every error message.

    Raises
    ------
    TypeError
        If ``ref`` is not a ``Ref``; or, when ``expected_ns`` is given, if
        ``ref.nss()`` does not start with it or ``ref.ns()`` is not a key of
        ``NAMESPACES``.
    """
    ctx = "" if not context else f"{context}: "

    if not isinstance(ref, Ref):
        raise TypeError(f"{ctx}expected Ref, got value {ref!r} of type {type(ref).__name__}")

    # Without an expected hierarchy the type check above is the whole contract.
    if expected_ns is None:
        return

    hierarchy = ref.nss()
    ns = ref.ns()
    leading = hierarchy[: len(expected_ns)]
    if leading != expected_ns:
        raise TypeError(f"{ctx}expected namespace hierarchy {expected_ns}, got {hierarchy} for {ref!r}")
    if ns not in NAMESPACES:
        raise TypeError(f"{ctx}namespace {ns} not registered for {ref!r}")
+ """ + namespace = cls.__name__.lower() + if namespace not in NAMESPACES: + NAMESPACES[namespace] = cls + obj = dataclass(cls) + obj._ns = namespace + return obj + + +@dataclass +class DmlBase: + """Base class for DML data objects with serialization support.""" + + def to_dict(self) -> dict: + """Convert to dictionary, excluding private attributes. + + Returns + ------- + dict + Dictionary representation excluding keys starting with '_'. + """ + from dataclasses import fields + + return {f.name: getattr(self, f.name) for f in fields(self) if not f.name.startswith("_")} + + @classmethod + def from_dict(cls, d: dict) -> Self: + """Create instance from dictionary. + + Parameters + ---------- + d : dict + Dictionary with field data. + + Returns + ------- + Self + Instance created from dictionary data. + """ + return cls(**d) + + def _validate(self) -> None: + """Default no-op validation for DML objects. + + Subclasses should override to implement strict validation of + field types and expected Ref namespaces. + """ + return + + def __post_init__(self): + """Run validation after dataclass initialization. + + All DmlBase subclasses will call their `_validate` method to + assert field types and namespaces are correct. + """ + self._validate() + + +class Datum(DmlBase): + """Base class for data values in the DML system. + + Datum subclasses represent different types of data: scalars, lists, dicts, + URIs, and runnables. Each subclass is registered under its own + `datum-` namespace. + + Notes + ----- + - Datum *itself* is abstract and is NOT registered as a top-level + namespace. Each concrete Datum subclass is registered under its own + `datum-` namespace and gets its `._ns` attribute set so it + can be stored directly without an explicit `to=` Ref. + """ + + def __init_subclass__(cls, **kwargs): + """Register datum subclasses for deserialization and DB namespace. 
+ + For a subclass `ScalarDatum` we register: + - NAMESPACES['datum-scalar'] = ScalarDatum (for LMDB namespace lookups) + - ScalarDatum._ns = 'datum-scalar' (so instances can be put without `to=`) + """ + super().__init_subclass__(**kwargs) + name = getattr(cls, "__datum_name__", cls.__name__).lower() + if name.endswith("datum"): + name = name[:-5] + # register concrete per-datum namespace, e.g. datum-scalar + concrete_ns = f"datum-{name}" + if concrete_ns not in NAMESPACES: + NAMESPACES[concrete_ns] = cls + # ensure instances have a concrete _ns so BaseOps.put works without `to=` + cls._ns = concrete_ns + + +@dataclass +class ScalarDatum(Datum): + """Datum containing a scalar value. + + Attributes + ---------- + data : Scalar + The scalar value (None, int, float, str, or bool). + """ + + data: Scalar + + def _validate(self) -> None: + if not isinstance(self.data, (type(None), int, float, str, bool)): + raise TypeError( + f"{self.__class__.__name__}.data must be a scalar " + f"(None, int, float, str, bool), got: {type(self.data).__name__}" + ) + + +@dataclass +class ListDatum(Datum): + """Datum containing a list of references to other datums. + + Attributes + ---------- + data : list[Ref] + List of references to datum objects. + """ + + data: list[Ref] # -> datum + + def _validate(self) -> None: + if not isinstance(self.data, list): + raise TypeError(f"{self.__class__.__name__}.data must be a list") + for i, item in enumerate(self.data): + if not isinstance(item, Ref): + raise TypeError(f"{self.__class__.__name__}.data[{i}] must be a Ref, got {type(item).__name__}") + if not item.nss()[0] == "datum": + raise TypeError(f"{self.__class__.__name__}.data[{i}] must be a Ref to datum-*, got {item.ns()}") + + +@dataclass +class DictDatum(Datum): + """Datum containing a dictionary of references to other datums. + + Attributes + ---------- + data : dict[str, Ref] + Dictionary mapping strings to references to datum objects. 
+ """ + + data: dict[str, Ref] # -> datum + + def _validate(self) -> None: + if not isinstance(self.data, dict): + raise TypeError(f"{self.__class__.__name__}.data must be a dict") + for k, v in self.data.items(): + if not isinstance(k, str): + raise TypeError(f"{self.__class__.__name__}.data keys must be strings, got {type(k).__name__}") + if not isinstance(v, Ref): + raise TypeError(f"{self.__class__.__name__}.data[{k!r}] must be a Ref, got {type(v).__name__}") + if not v.nss()[0] == "datum": + raise TypeError(f"{self.__class__.__name__}.data[{k!r}] must be a Ref to datum-*, got {v.ns()}") + + +@dataclass +class Uri(Datum): + """URI reference to an external location. + + Stores a URI string that points to an external location + (e.g., S3 URI, Docker image, file path). + + Attributes + ---------- + uri : str + The URI string. + """ + + uri: str + + def _validate(self) -> None: + if not isinstance(self.uri, str): + raise TypeError(f"{self.__class__.__name__}.uri must be a string, got: {self.uri!r}") + + +@dataclass +class Runnable(DmlBase): + """Public runnable value with fully materialized Python fields.""" + + target: Uri + sub: Optional["Runnable"] = None + kwargs: dict[str, Any] = field(default_factory=dict) + adapter: str = "" + + def _validate(self) -> None: + tname = self.__class__.__name__ + if not isinstance(self.target, Uri): + raise TypeError(f"{tname}.target must be Uri, got {type(self.target).__name__}") + if self.sub is not None and not isinstance(self.sub, Runnable): + raise TypeError(f"{tname}.sub must be Runnable or None, got {type(self.sub).__name__}") + if not isinstance(self.kwargs, dict): + raise TypeError(f"{tname}.kwargs must be a dict") + for k in self.kwargs: + if not isinstance(k, str): + raise TypeError(f"{tname}.kwargs keys must be strings") + if not isinstance(self.adapter, str): + raise TypeError(f"{tname}.adapter must be a string, got: {self.adapter!r}") + + +@dataclass +class RunnableDatum(Datum): + """Specification for an executable 
computation. + + Represents something that can be executed via an adapter, with default + parameters. Runnables can wrap other runnables for composition + (e.g., "run on AWS Batch" wrapping "run Python process"). + + Attributes + ---------- + target : Ref + Reference to a Uri. + sub : Ref | None + Optional reference to another RunnableDatum (for wrapping). + kwargs : Ref + Reference to a DictDatum mapping keyword names to datum refs. + adapter : str + The adapter name used to execute this runnable. + """ + + __datum_name__ = "runnable" + + target: Ref # -> datum-uri + sub: Optional[Ref] # -> datum-runnable + kwargs: Ref # -> datum-dict + adapter: str + + def _validate(self) -> None: + tname = self.__class__.__name__ + if not isinstance(self.target, Ref): + raise TypeError(f"{tname}.target must be a Ref, got {type(self.target).__name__}") + if self.target.ns() != "datum-uri": + raise TypeError(f"{tname}.target must be a Ref to datum-uri, got {self.target.ns()}") + if self.sub is not None: + if not isinstance(self.sub, Ref): + raise TypeError(f"{tname}.sub must be a Ref or None, got {type(self.sub).__name__}") + if self.sub.ns() != "datum-runnable": + raise TypeError(f"{tname}.sub must be a Ref to datum-runnable, got {self.sub.ns()}") + if not isinstance(self.kwargs, Ref): + raise TypeError(f"{tname}.kwargs must be a Ref, got {type(self.kwargs).__name__}") + if self.kwargs.ns() != "datum-dict": + raise TypeError(f"{tname}.kwargs must be a Ref to datum-dict, got {self.kwargs.ns()}") + if not isinstance(self.adapter, str): + raise TypeError(f"{tname}.adapter must be a string, got: {self.adapter!r}") + + +@_register_dml_obj +class Deletable(DmlBase): + """URI-backed value marked for deletion during garbage collection. + + Signals that this object can be deleted during garbage collection operations. 
+ """ + + uri: str + + def _validate(self) -> None: + if not isinstance(self.uri, str): + raise TypeError(f"{self.__class__.__name__}.uri must be a string, got: {self.uri!r}") + + @classmethod + def from_uri(cls, uri_datum: Uri) -> Self: + """Create deletable from a Uri datum. + + Parameters + ---------- + uri_datum : Uri + The Uri datum to convert. + Returns + ------- + Deletable + A deletable for the given URI. + """ + return cls(uri_datum.uri) + + +@_register_dml_obj +class Error(DmlBase, Exception): + """Error information with stack traces. + + Represents a captured error from a computation, storing error details + and stack trace information for debugging. + + Attributes + ---------- + message : str + The error message. + origin : str + The origin/source of the error (e.g., 'python', 'adapter'). + type : str + The error type name. + stack : list[dict] + Stack trace frames as dictionaries. + """ + + message: str + origin: str + type: str + stack: list[dict] = field(default_factory=list) + + def __post_init__(self): + """Initialize Exception base with message and run base initialization.""" + Exception.__init__(self, self.message) + super().__post_init__() + + def _validate(self) -> None: + if not isinstance(self.message, str): + raise TypeError(f"{self.__class__.__name__}.message must be a string") + if not isinstance(self.origin, str): + raise TypeError(f"{self.__class__.__name__}.origin must be a string") + if not isinstance(self.type, str): + raise TypeError(f"{self.__class__.__name__}.type must be a string") + if not isinstance(self.stack, list): + raise TypeError(f"{self.__class__.__name__}.stack must be a list of frame dicts") + for frame in self.stack: + if not isinstance(frame, dict): + raise TypeError(f"{self.__class__.__name__}.stack frame must be a dict") + + @classmethod + def from_ex(cls, exc) -> Self: + """Create Error from Python exception. + + Parameters + ---------- + exc : Exception + Python exception to convert. 
+ + Returns + ------- + Error + Error object with extracted stack trace. + """ + tb = traceback.extract_tb(exc.__traceback__) + stack = [ + { + "filename": frame.filename, + "lineno": frame.lineno, + "name": frame.name, + "line": frame.line, + } + for frame in tb + ] + return cls( + message=str(exc), + origin="python", + type=type(exc).__name__.lower(), + stack=stack, + ) + + +class DmlRepoError(Error): + """Exception raised by DML repository operations.""" + + def __init__( + self, + message: str, + *, + origin: str = "dml", + type: str = "dmlrepoerror", + stack: Optional[list[dict]] = None, + ): + super().__init__(message=message, origin=origin, type=type, stack=stack or []) + + +class DmlPointerConflictError(DmlRepoError): + """Raised when a branch or index commit update loses a stale-write race.""" + + def __init__(self, message: str, *, current_commit: Ref): + super().__init__(message, type="dmlpointerconflicterror") + self.current_commit = current_commit + + +class Node(DmlBase): + """Base class for computational nodes in a DAG. + + Nodes represent individual computation steps or values in a directed + acyclic graph. They can be literal values, imports from other DAGs, + or function calls. + + Notes + ----- + - Node *itself* is abstract and is NOT registered as a top-level + namespace. Each concrete Node subclass is registered under its own + `node-` namespace and gets its `._ns` attribute set so it + can be stored directly without an explicit `to=` Ref. + """ + + def __init_subclass__(cls, **kwargs): + """Register node subclasses for deserialization and DB namespace. + + For a subclass `ArgvNode` we register: + - NAMESPACES['node-argv'] = ArgvNode (for LMDB namespace lookups) + - ArgvNode._ns = 'node-argv' (so instances can be put without `to=`) + """ + super().__init_subclass__(**kwargs) + name = cls.__name__.lower() + if name.endswith("node"): + name = name[:-4] + # register concrete per-node namespace, e.g. 
node-argv + concrete_ns = f"node-{name}" + if concrete_ns not in NAMESPACES: + NAMESPACES[concrete_ns] = cls + # ensure instances have a concrete _ns so BaseOps.put works without `to=` + cls._ns = concrete_ns + + def datum_ref(self, txn: "TxnContext") -> Ref: + raise NotImplementedError("Subclasses must implement datum_ref method") + + +@dataclass +class LiteralNode(Node): + """Node containing a literal value or error. + + Attributes + ---------- + value : Ref + Reference to a Datum or Error object. + """ + + value: Ref # => datum + + def _validate(self) -> None: + require_ref(self.value, expected_ns=["datum"], context=f"{self.__class__.__name__}.value") + + def datum_ref(self, txn: "TxnContext") -> Ref: + """Get the Datum reference for this node's value. + + Parameters + ---------- + txn : "TxnContext" + Transaction context to resolve references. + + Returns + ------- + Ref + The Datum reference for this node's value. + """ + return self.value + + +@dataclass +class ArgvNode(LiteralNode): + """Special literal node representing function arguments. + + Used to mark the argv input to a function call in a DAG. + """ + + +@dataclass +class KwargvNode(LiteralNode): + """Special literal node representing function keyword arguments. + + Used to mark the kwargv input to a function call in a DAG. + The value must be a Ref to a Datum containing a dict of str->Ref(datum). + """ + + +@dataclass +class ImportNode(Node): + """Node importing a result from another DAG. + + Attributes + ---------- + dag : Ref + Reference to the source DAG. + node : Ref + Reference to the specific node in that DAG. + """ + + dag: Ref # => dag + node: Ref # => node + + def _validate(self) -> None: + require_ref(self.dag, expected_ns=["dag"], context=f"{self.__class__.__name__}.dag") + require_ref(self.node, expected_ns=["node"], context=f"{self.__class__.__name__}.node") + + def datum_ref(self, txn: "TxnContext") -> Ref: + """Get the value from the imported node. 
+ + Returns + ------- + Ref + The value reference from the imported node. + + Raises + ------ + DmlRepoError + If node not associated with a transaction context. + """ + node = txn.get(self.node) + return node.datum_ref(txn) + + +@dataclass +class FnNode(Node): + """Node representing a function call with arguments. + + Attributes + ---------- + dag : Ref + Reference to the function's DAG. + node : Ref + Reference to the result node. + argv : list[Ref] + List of argument node references. + """ + + argv: list[Ref] # => node + dag: Ref # => dag + + def _validate(self) -> None: + require_ref(self.dag, expected_ns=["dag"], context=f"{self.__class__.__name__}.dag") + if not isinstance(self.argv, list): + raise TypeError("argv must be a list of node Refs") + for a in self.argv: + require_ref(a, expected_ns=["node"], context=f"{self.__class__.__name__}.argv") + + def datum_ref(self, txn: "TxnContext") -> Ref: + """Get the value from the function call node. + + Returns + ------- + Ref + The value reference from the function call node. + + Raises + ------ + DmlRepoError + If node not associated with a transaction context. + """ + dag = txn.get(self.dag) + if dag.error is not None: + raise txn.get(dag.error) + if dag.result is None: + raise DmlRepoError("DAG has no result node") + node = txn.get(dag.result) + return node.datum_ref(txn) + + +@_register_dml_obj +class Dag(DmlBase): + """Directed acyclic graph of computational nodes. + + A DAG represents a complete computation with nodes, named references, + and an optional result node. + + Attributes + ---------- + nodes : list[Ref] + List of node references in this DAG. + names : dict[str, Ref] + Named references to nodes (variable names). + result : Optional[Ref] + The final result node of this computation. + error : Optional[Ref] + Optional reference to an error node if computation failed. + argv : Optional[Ref] + Optional reference to the argv node for function calls. 
+ """ + + nodes: list[Ref] # -> node + names: dict[str, Ref] # -> node + result: Optional[Ref] # -> node + error: Optional[Ref] = None # -> error + argv: Optional[Ref] = None # -> node-argv + + def _validate(self) -> None: + tname = self.__class__.__name__ + if not isinstance(self.nodes, list): + raise TypeError(f"{tname}.nodes must be a list of Refs") + for n in self.nodes: + require_ref(n, expected_ns=["node"], context=f"{tname}.nodes") + if not isinstance(self.names, dict): + raise TypeError(f"{tname}.names must be a dict of str->Ref") + for k, v in self.names.items(): + if not isinstance(k, str): + raise TypeError(f"{tname}.names keys must be strings") + require_ref(v, expected_ns=["node"], context=f"{tname}.names[{k!r}]") + if self.result is not None and self.error is not None: + raise TypeError(f"{tname}: cannot have both result and error") + if self.result is not None: + require_ref(self.result, expected_ns=["node"], context=f"{tname}.result") + if self.error is not None: + require_ref(self.error, expected_ns=["error"], context=f"{tname}.error") + if self.argv is not None: + require_ref(self.argv, expected_ns=["node", "argv"], context=f"{tname}.argv") + + def nameof(self, ref): + """Get the name of a node reference. + + Parameters + ---------- + ref : Ref + The node reference to look up. + + Returns + ------- + str | None + The name of the node, or None if not named. + """ + return {v: k for k, v in self.names.items()}.get(ref) + + def is_finished(self, success: Optional[bool] = None) -> bool: + """Check if the DAG has a result node. + + Parameters + ---------- + success : bool | None + If True, check for successful result (no error). + If False, check for error result. + If None, check for either result or error. + + Returns + ------- + bool + True if the DAG has a result, False otherwise. 
+ """ + if success is True: + return self.result is not None + if success is False: + return self.error is not None + return (self.result or self.error) is not None + + def cache_key(self, txn: "TxnContext") -> str: + """Compute a cache key for this DAG. + + Items in the cache are stored under `{key}`. + + Parameters + ---------- + txn : "TxnContext" + Transaction context to resolve references. + + Returns + ------- + str + The datum_ref.id() of the argv Datum. + """ + if self.argv is None: + raise DmlRepoError("Cannot compute cache key for DAG without argv.") + argv_node = txn.get(self.argv) + return argv_node.value.id() + + +@_register_dml_obj +class Tree(DmlBase): + """Named collection of DAGs. + + A tree organizes multiple DAGs by name, typically representing + different computations or workflow branches. + + Attributes + ---------- + dags : dict[str, Ref] + Mapping of names to DAG references. + """ + + dags: dict[str, Ref] # -> dag + + def _validate(self) -> None: + if not isinstance(self.dags, dict): + raise TypeError("dags must be a dict of str->Ref") + for k, v in self.dags.items(): + if not isinstance(k, str): + raise TypeError(f"{self.__class__.__name__}.dags keys must be strings") + require_ref(v, expected_ns=["dag"], context=f"{self.__class__.__name__}.dags[{k!r}]") + + +@_register_dml_obj +class Commit(DmlBase): + """Versioned snapshot with metadata. + + A commit represents a point-in-time state of the repository, + including the tree, DAG, authorship, and history information. + + Attributes + ---------- + parents : list[Ref] + Parent commit references (empty for initial commit). + tree : Ref + Reference to the Tree for this commit. + author : str + Name of the commit author. + message : str + Commit message describing the change. + dag : Optional[Ref] + Optional reference to the DAG for this commit. + created : str + ISO timestamp when commit was created. + modified : str + ISO timestamp when commit was last modified. 
+ """ + + parents: list[Ref] # -> commit + tree: Ref # -> tree + author: str + message: str + dag: Optional[Ref] = None # -> Dag + created: str = field(default_factory=now) + modified: str = field(default_factory=now) + + def _validate(self) -> None: + if not isinstance(self.parents, list): + raise TypeError("parents must be a list of commit Refs") + for p in self.parents: + require_ref(p, expected_ns=["commit"], context=f"{self.__class__.__name__}.parents") + require_ref(self.tree, expected_ns=["tree"], context=f"{self.__class__.__name__}.tree") + if not isinstance(self.author, str): + raise TypeError(f"{self.__class__.__name__}.author must be a string, got: {self.author!r}") + if not isinstance(self.message, str): + raise TypeError(f"{self.__class__.__name__}.message must be a string, got: {self.message!r}") + if self.dag is not None: + require_ref(self.dag, expected_ns=["dag"], context=f"{self.__class__.__name__}.dag") + if not isinstance(self.created, str): + msg = f"{self.__class__.__name__}.created must be an ISO timestamp string, got: {self.created!r}" + raise TypeError(msg) + if not isinstance(self.modified, str): + msg = f"{self.__class__.__name__}.modified must be an ISO timestamp string, got: {self.modified!r}" + raise TypeError(msg) + diff --git a/src/daggerml/_internal/util.py b/src/daggerml/_internal/util.py new file mode 100644 index 0000000..d248cee --- /dev/null +++ b/src/daggerml/_internal/util.py @@ -0,0 +1,130 @@ +"""Utility functions for the DML repository system. 
+ +Public API: + unnest - Flatten a list of lists + some - Return first truthy value or default + assert_exactly_one - Assert exactly one non-None value + makedirs - Create directories with secure permissions + readfile - Read file contents + writefile - Write file contents + fullname - Get full qualified name of object + now - Get current UTC time as ISO string + as_list - Ensure value is a list + merge_counters - Merge counter dictionaries + tree_map - Apply function to tree structure +""" + +from __future__ import annotations + +import os +import secrets +import time +import uuid +from datetime import datetime, timezone +from typing import Any, Iterable + + +def unnest(nested: Iterable[Iterable[Any]]) -> list: + return [x for xs in nested for x in xs] + + +def some(xs, default=None): + return next((x for x in xs if x), default) + + +def assert_exactly_one(*objs, message=None): + """ + Asserts that exactly one of the provided objects is not None. + """ + count = sum(1 for v in objs if v is not None) + if count != 1: + raise ValueError( + message or f"Exactly one of the provided values must be non-None, but found {count} non-None values: {objs}" + ) + + +def makedirs(path): + os.makedirs(path, mode=0o700, exist_ok=True) + return path + + +def readfile(path, *paths): + if path is not None: + p = os.path.join(path, *paths) + if os.path.exists(p): + with open(p) as f: + result = f.read().strip() + return result or None + + +def writefile(contents, path, *paths): + if path is not None: + p = os.path.join(path, *paths) + if contents is None: + if os.path.exists(p): + os.remove(p) + else: + os.makedirs(os.path.dirname(p), mode=0o700, exist_ok=True) + with open(p, "w") as f: + f.write(contents) + + +def fullname(obj): + if not isinstance(obj, type): + return fullname(type(obj)) + return f"{obj.__module__}.{obj.__qualname__}" + + +def now(): + return datetime.now(timezone.utc).isoformat() + + +def as_list(x) -> list: + return list(x) if isinstance(x, (list, tuple)) else 
[x] + + +def merge_counters(x, *xs): + if not len(xs): + return x + y, rest = xs[0], xs[1:] + result = {} + for k in set(x.keys()).union(set(y.keys())): + result[k] = unnest([as_list(x.get(k, 0)), as_list(y.get(k, 0))]) + return merge_counters(result, *rest) if len(rest) else result + + +def tree_map(predicate, fn, item): + if predicate(item): + item = fn(item) + if isinstance(item, list): + return [tree_map(predicate, fn, x) for x in item] + if isinstance(item, dict): + return {k: tree_map(predicate, fn, v) for k, v in item.items()} + return item + + +def uuid7() -> uuid.UUID: + """Temporally orderable UUID (up to the millisecond)""" + # Unix timestamp in milliseconds (48 bits) + ts_ms = int(time.time_ns() // 1_000_000) & ((1 << 48) - 1) + # 80 random bits + rand = secrets.randbits(80) + # Layout: + # + # 48b timestamp + # 4b version (0111) + # 12b rand_a + # 2b variant (10) + # 62b rand_b + value = 0 + # timestamp + value |= ts_ms << 80 + # version + value |= 0x7 << 76 + # rand_a (12 bits) + value |= ((rand >> 68) & 0xFFF) << 64 + # variant (RFC 4122 / RFC 9562) + value |= 0b10 << 62 + # rand_b (62 bits) + value |= rand & ((1 << 62) - 1) + return uuid.UUID(int=value) diff --git a/src/daggerml/api.py b/src/daggerml/api.py new file mode 100644 index 0000000..9bf9965 --- /dev/null +++ b/src/daggerml/api.py @@ -0,0 +1,839 @@ +import logging +import time +from contextlib import contextmanager +from contextvars import ContextVar +from dataclasses import dataclass, field +from tempfile import TemporaryDirectory +from typing import Any, Iterator, Optional, Union, cast, overload + +from daggerml._internal import ( + Dml, + DmlRepoError, + Error, + Ref, + Runnable, + Uri, +) +from daggerml.codecs import apply_codecs +from daggerml.util import BackoffWithJitter, current_time_millis + +log = logging.getLogger(__name__) + + +Scalar = Union[str, int, float, bool, type(None), Uri, Runnable] +Collection = Union[list, tuple, dict] +_NO_DEFAULT_DML = object() +_SCOPED_DEFAULT_DML: 
ContextVar[object] = ContextVar("daggerml_scoped_default_dml", default=_NO_DEFAULT_DML) +_PROCESS_DEFAULT_DML: Optional["Dml"] = None + + +def _resolve_default_dml(*, create: bool = True) -> tuple["Dml", str]: + scoped = _SCOPED_DEFAULT_DML.get() + if scoped is not _NO_DEFAULT_DML: + return cast("Dml", scoped), "scoped" + + global _PROCESS_DEFAULT_DML + if _PROCESS_DEFAULT_DML is not None: + return _PROCESS_DEFAULT_DML, "process" + + if not create: + raise DmlRepoError("No default Dml is configured") + + _PROCESS_DEFAULT_DML = Dml() + return _PROCESS_DEFAULT_DML, "implicit" + + +def get_default_dml() -> "Dml": + """Return the active default Dml runtime.""" + dml, _source = _resolve_default_dml(create=True) + return dml + + +def set_default_dml(dml: "Dml") -> None: + """Set the process-default Dml runtime.""" + global _PROCESS_DEFAULT_DML + _PROCESS_DEFAULT_DML = dml + + +def clear_default_dml() -> None: + """Clear the process-default Dml runtime.""" + global _PROCESS_DEFAULT_DML + _PROCESS_DEFAULT_DML = None + + +@contextmanager +def use_default_dml(dml: "Dml"): + """Temporarily override the default Dml runtime for the active context.""" + token = _SCOPED_DEFAULT_DML.set(dml) + try: + yield dml + finally: + _SCOPED_DEFAULT_DML.reset(token) + + +def new(name="", *, message="", argv_ptr=None, dml: Dml | None = None) -> "Dag": + """Create a new DAG using the active or provided Dml runtime.""" + runtime = dml or get_default_dml() + index_id = runtime.runtime.create(argv_ptr=argv_ptr) + return Dag(dml=runtime, token=index_id, name=name, message=message) + + +def load(name: str, dml=None) -> "Dag": + """Load a DAG using the active default Dml runtime.""" + dml = dml or get_default_dml() + dag_info = dml.dag.get(name) + if dag_info is None: + raise DmlRepoError(f"DAG not found: {name}") + return Dag(dml=dml, ref=dag_info["dag"]["ref"], name=name) + + +@contextmanager +def temporary(**kw): + """Create a temporary Dml runtime with an initial commit.""" + kw.pop("name", 
None) + with TemporaryDirectory() as tmpdir: + resp = Dml.init(project_home=tmpdir, **kw) + yield Dml(resp["project_home"], remote_root=resp["remote_root"]) + + +def status() -> dict[str, object]: + """Return status for the active default Dml runtime.""" + dml, source = _resolve_default_dml(create=True) + return { + "default": { + "source": source, + "has_scoped_override": _SCOPED_DEFAULT_DML.get() is not _NO_DEFAULT_DML, + "has_process_default": _PROCESS_DEFAULT_DML is not None, + }, + "status": dml.status(), + } + + +def _make_node(dag: "Dag", ref: Ref) -> "Node": + """ + Create a Node from a Dag and Ref. + + Parameters + ---------- + dag : Dag + The parent DAG. + ref : Ref + The reference to the node. + Returns + ------- + Node + A Node instance representing the reference in the DAG. + """ + node_value = dag.dml.dag.get_node(ref)["node"] + info: dict[str, Any] = {"data_type": type(node_value).__name__.lower()} + # Determine node type based on value and populate info + if isinstance(node_value, list): + info["length"] = len(node_value) + node = ListNode(dag, ref, _info=info) + elif isinstance(node_value, dict): + info["length"] = len(node_value) + info["keys"] = list(node_value.keys()) + node = DictNode(dag, ref, _info=info) + elif isinstance(node_value, Runnable): + node = RunnableNode(dag, ref, _info=info) + else: + node = ScalarNode(dag, ref, _info=info) + return node + + +@dataclass +class Dag: + dml: Dml + token: Optional[str] = None # Working index id + ref: Optional[Ref] = None + name: str = "" # DAG name for commit + message: str = "" # Commit message + + def __repr__(self): + to = self.ref.to if self.ref else (self.token if self.token is not None else "NA") + return f"Dag({to})" + + def __hash__(self): + "Useful only for tests." 
+ return 42 + + def __enter__(self): + "Catch exceptions and commit an Error" + assert not self.ref + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_value is not None: + # Convert exception to Error and commit it + err = Error.from_ex(exc_value) if not isinstance(exc_value, Error) else exc_value + self.commit(err) + + def _require_index_ref(self) -> str: + index_id = self.token + if index_id is None: + raise DmlRepoError("No active index") + return index_id + + def _put_literal(self, value: Any, *, name: Optional[str] = None) -> Ref: + index_id = self._require_index_ref() + return self.dml.runtime.put_literal(index_id, value, name=name) + + def _stage_value(self, value: Any, *, name: Optional[str] = None) -> Ref: + return self._put_literal(apply_codecs(value, dag=self), name=name) + + def _start_fn( + self, argv: list[Ref], *, kwargv: Optional[dict[str, Ref]] = None, name: Optional[str] = None + ) -> Optional[Ref]: + return self.dml.runtime.start_fn(self._require_index_ref(), argv, kwargv=kwargv, name=name) + + def _call_builtin(self, uri: str, *args: Any, name: Optional[str] = None) -> Ref: + fn_ref = self._put_literal(Runnable(target=Uri(uri), kwargs={}, adapter="")) + argv: list[Ref] = [fn_ref] + for arg in args: + argv.append(arg if isinstance(arg, Ref) else self._stage_value(arg)) + result = self._start_fn(argv, name=name) + if result is None: + raise DmlRepoError("Function execution failed") + return result + + def __len__(self) -> int: + if self.ref is None: + info = self.dml.runtime.describe(self._require_index_ref()) + return len(info["names"]) + names_dict = self.dml.dag.describe(cast(Ref, self.ref))["dag"]["names"] + return len(names_dict) + + def __iter__(self): + yield from self.keys() + + def _get_named_node(self, name: str) -> "Node": + if self.ref is None: + node_ref = self.dml.runtime.get_node(self._require_index_ref(), name) + return _make_node(self, node_ref) + node_ref = self.dml.dag.describe_node(name, 
dag=cast(Ref, self.ref))["node"]["ref"] + return _make_node(self, node_ref) + + def _set_named_node(self, name: str, value: Any) -> None: + if self.ref is not None: + raise DmlRepoError("Cannot set node names on a committed DAG.") + if isinstance(value, Ref): + self.dml.runtime.set_node_name(self._require_index_ref(), name, value) + return + self.put(value, name=name) + + def __getitem__(self, name: str) -> "Node": + if not isinstance(name, str): + raise TypeError(f"Dag node name must be str, got {type(name).__name__}") + return self._get_named_node(name) + + def __setitem__(self, name: str, value: Any) -> None: + if not isinstance(name, str): + raise TypeError(f"Dag node name must be str, got {type(name).__name__}") + self._set_named_node(name, value) + + def __getattr__(self, name: str) -> "Node": + if name.startswith("_"): + raise AttributeError(name) + return self[name] + + def __setattr__(self, name: str, value: Any) -> None: + dataclass_fields = getattr(type(self), "__dataclass_fields__", {}) + if name in dataclass_fields or name.startswith("_") or hasattr(type(self), name): + object.__setattr__(self, name, value) + return + self[name] = value + + def keys(self) -> list[str]: + """Get the list of all node names in the dag""" + if self.ref is None: + info = self.dml.runtime.describe(self._require_index_ref()) + return list(info["names"].keys()) + names_dict = self.dml.dag.describe(cast(Ref, self.ref))["dag"]["names"] + return list(names_dict.keys()) + + def values(self) -> list["Node"]: + """Get the list of all nodes in the dag""" + if self.ref is None: + info = self.dml.runtime.describe(self._require_index_ref()) + return [_make_node(self, ref) for ref in info["names"].values()] + names_dict = self.dml.dag.describe(cast(Ref, self.ref))["dag"]["names"] + return [_make_node(self, ref) for ref in names_dict.values()] + + @property + def argv(self) -> "ListNode": + "Access the dag's argv node" + if self.ref is None: + argv_ref = 
self.dml.runtime.get_argv(self._require_index_ref()) + return cast(ListNode, _make_node(self, argv_ref)) + argv_ref = self.dml.dag.describe(cast(Ref, self.ref))["dag"]["argv"] + assert isinstance(argv_ref, Ref), f"'{self.__class__.__name__}' dag has no argv" + return cast(ListNode, _make_node(self, argv_ref)) + + @property + def result(self) -> "Node": + """Get the result node of the dag""" + ref = self.dml.dag.describe(cast(Ref, self.ref))["dag"].get("result") + assert isinstance(ref, Ref), f"'{self.__class__.__name__}' dag has not been committed yet" + return _make_node(self, ref) + + @overload + def put(self, value: Union[list, "ListNode"], *, name=None) -> "ListNode": ... + @overload + def put(self, value: Union[dict, "DictNode"], *, name=None) -> "DictNode": ... + @overload + def put(self, value: Union[Runnable, "RunnableNode"], *, name=None) -> "RunnableNode": ... + @overload + def put(self, value: Union[Scalar, "ScalarNode"], *, name=None) -> "ScalarNode": ... + @overload + def put(self, value: "Node", *, name=None) -> "Node": ... + def put(self, value: Any, *, name=None) -> "Node": + """ + Add a value to the DAG. + + Parameters + ---------- + value : Union[Scalar, Collection] + Value to add + name : str, optional + Name for the node + Returns + ------- + Node + Node representing the value + + Examples + -------- + >>> import daggerml as _dml + >>> dml = _dml.temporary() + >>> dag = new(dml=dml, name="test", message="test") + >>> n1 = dag.put(42, name="answer") + >>> n1.value() + 42 + >>> n2 = dag.put({"a": 1, "b": [n1, "23"]}) + >>> n2.value() + {'a': 1, 'b': [42, '23']} + >>> n3 = dag.put({"a": 1, "b": [n1, "23"]}) + >>> n3.value() + {'a': 1, 'b': [42, '23']} + """ + return _make_node(self, self._stage_value(value, name=name)) + + def load(self, dag_name: str, node_name: str | None = None, *, name: str | None = None) -> "Node": + """ + Load a node from a different (committed) DAG into the current DAG. 
+ + Parameters + ---------- + dag_name : str + Name of the DAG to load from + node_name : str, optional + Name of the node to load. If None, loads the result node of the DAG. + + Returns + ------- + Node + The loaded node or DAG + + Examples + -------- + >>> import daggerml as _dml + >>> dml = _dml.temporary() + >>> dag = new(dml=dml, name="test", message="test") + >>> n1 = dag.put(42, name="answer") + >>> n2 = dag.put({"a": 1, "b": [n1, "23"]}, name="data") + >>> dag.commit(n2) + >>> dag2 = new(dml=dml, name="test2", message="test2") + >>> loaded_n2 = dag2.load("test", "data", name="loaded_data") + >>> loaded_n2.value() + {'a': 1, 'b': [42, '23']} + """ + if self.ref is None: + raise DmlRepoError("Cannot load from an uncommitted DAG") + index = self.dml.admin.index.get(self._require_index_ref())["index"] + dags = self.dml.dag.list(revision=index["commit"]["ref"].to)["dags"] + dag_info = dags.get(dag_name) + if dag_info is None: + raise DmlRepoError(f"DAG not found: {dag_name}") + dag_info = self.dml.dag.describe(dag_info)["dag"] + node_ref = dag_info["names"].get(node_name) if node_name else dag_info.get("result") + if node_ref is None: + raise DmlRepoError(f"Node '{node_name}' not found in DAG '{dag_name}'") + return _make_node(self, node_ref) + + def call( + self, + fn: Any, + *args: Any, + name: Optional[str] = None, + sleep: Optional[callable] = None, + timeout: int = -1, + **kw, + ) -> "Node": + """ + Call a function node with arguments. + + Parameters + ---------- + fn : Union[Runnable, RunnableNode] + Function to call + *args : Union[Node, Scalar, Collection] + Arguments to pass to the function + name : str, optional + Name for the result node + sleep : callable, optional + A nullary function that returns sleep time in milliseconds + timeout : int, default=-1 + Maximum time to wait in milliseconds. If <= 0, wait indefinitely. + **kw : dict + Keyword arguments override default values on the function specification. 
+ + Returns + ------- + Node + Result node + + Raises + ------ + TimeoutError + If the function call exceeds the timeout + Error + If the function returns an error + """ + kwargv_refs: dict[str, Ref] = {} + for key, value in kw.items(): + kwargv_refs[key] = self._stage_value(value) + + sleep = sleep or BackoffWithJitter() + argv_seed = [fn, *args] + end = current_time_millis() + timeout + while timeout <= 0 or current_time_millis() < end: + argv_refs = [self._stage_value(value) for value in argv_seed] + resp = self._start_fn(argv_refs, kwargv=kwargv_refs, name=name) + if resp: + return _make_node(self, resp) + time.sleep(sleep() / 1000) + raise TimeoutError(f"invoking function: {fn}") + + def commit(self, value) -> None: + """ + Commit a value to the DAG. + + Parameters + ---------- + value : Union[Node, Error, Any] + Value to commit + """ + branch = self.dml.branch()["head"] + if branch is None: + raise DmlRepoError("Current checkout is detached; attach HEAD to commit") + + # For Errors, pass directly to _commit (don't try to store as literal) + if isinstance(value, Error): + commit_ref = self.dml.runtime.commit( + self._require_index_ref(), + value, + head=branch, + message=self.message, + dag_name=self.name, + ) + else: + # For other values, ensure it's a Node and get its ref + value = value if isinstance(value, Node) else self.put(value) + value_ref = value.ref + commit_ref = self.dml.runtime.commit( + self._require_index_ref(), + value_ref, + head=branch, + message=self.message, + dag_name=self.name, + ) + + # Extract the dag ref from the commit + self.ref = self.dml.dag.list(revision=commit_ref.to)["dags"][self.name] + + +@dataclass(frozen=True) +class Node: # noqa: F811 + """ + Representation of a node in a DaggerML DAG. 
+ + Parameters + ---------- + dag : Dag + Parent DAG + ref : Ref + Node reference + """ + + dag: Dag + ref: Ref + _info: dict = field(default_factory=dict) + + def __repr__(self): + ref_id = self.ref if isinstance(self.ref, Error) else self.ref.to + return f"{self.__class__.__name__}({ref_id})" + + def __hash__(self): + return hash(self.ref) + + def __eq__(self, other): + if not isinstance(other, Node): + return NotImplemented + return self.ref == other.ref + + @property + def argv(self) -> list["Node"]: + "Access the node's argv list" + node_info = self.dag.dml.dag.describe_node(self.ref)["node"] + argv = node_info.get("argv") + if argv is None: + raise Error("Node has no argv", origin="dml", type="TypeError") + return [_make_node(self.dag, ref) for ref in argv] + + def backtrack(self, *keys: Union[str, int]) -> "Node": + """ + If `key` is provided, it considers this node to be a collection created + by the appropriate method and loads the dag that corresponds to this key + + Parameters + ---------- + *keys : str, optional + Keys to backtrack through the node's structure + + Returns + ------- + Dag + The dag that this node was imported from (or in the case of a function call, this returns the fndag) + + Examples + -------- + >>> import daggerml as _dml + >>> dml = _dml.temporary() + >>> dag = new(dml=dml, name="test", message="test") + >>> l0 = dag.put(42) + >>> c0 = dag.put({"a": 1, "b": [l0, "23"]}) + >>> assert c0.backtrack("b", 0) == l0 + >>> assert c0.backtrack("b").backtrack(0) == l0 + >>> assert c0["b"][0] != l0 # this is a different node, not the same as l0 + >>> dml.cleanup() + """ + raise NotImplementedError("Node backtracking is temporarily disabled and will be reintroduced later.") + + def load(self) -> Dag: + """ + Load this node's execution context (DAG). + + Returns + ------- + Dag + This node's execution dag. 
+ """ + node_info = self.dag.dml.dag.describe_node(self.ref)["node"] + dag_ref = node_info.get("dag") + if isinstance(dag_ref, Ref): + return Dag(dml=self.dag.dml, ref=dag_ref) + return self.dag + + @property + def type(self): + """Get the data type of the node.""" + return self._info["data_type"] + + @overload + def value(self: "ScalarNode") -> Scalar: ... + @overload + def value(self: "ListNode") -> list: ... + @overload + def value(self: "DictNode") -> dict: ... + @overload + def value(self: "RunnableNode") -> Runnable: ... + @overload + def value(self: "Node") -> Any: ... + def value(self): + """ + Get the concrete value of this node. + + Returns + ------- + Any + The actual value represented by this node + """ + return self.dag.dml.dag.unroll_node(self.ref)["node"] + + def __call__(self, *args, name=None, sleep=None, timeout=-1, **kw) -> "Node": + raise TypeError(f"Node of type '{self.type}' is not callable") + + +class ScalarNode(Node): + pass + + +class RunnableNode(Node): + def __call__(self, *args, name=None, sleep=None, timeout=-1, **kw) -> "Node": + """ + Call this node as a function. + + Parameters + ---------- + *args : Any + Arguments to pass to the function + name : str, optional + Name for the result node + sleep : callable, optional + A nullary function that returns sleep time in milliseconds + timeout : int, default=-1 + Maximum time to wait in milliseconds. -1 means wait forever. + **kw : dict + Keyword arguments override runnable defaults. + + Returns + ------- + Node + Result node + + Raises + ------ + TimeoutError + If the function call exceeds the timeout + Error + If the function returns an error + """ + return self.dag.call(self, *args, name=name, sleep=sleep, timeout=timeout, **kw) + + +class CollectionNode(Node): # noqa: F811 + """ + Representation of a collection node in a DaggerML DAG. 
+ + Parameters + ---------- + dag : Dag + Parent DAG + ref : Ref + Node reference + """ + + def contains(self, item, *, name=None) -> "ScalarNode": + """ + For collection nodes, checks to see if `item` is in `self` + + Returns + ------- + Node + Node with the boolean of is `item` in `self` + """ + item_ref = item.ref if isinstance(item, Node) else item + result = self.dag._call_builtin("daggerml:contains", self.ref, item_ref, name=name) + return cast(ScalarNode, _make_node(self.dag, result)) + + def __contains__(self, item): + return self.contains(item).value() # has to return boolean + + def __len__(self): # python requires this to be an int + """ + Get the node's length + + Returns + ------- + Node + Node with the length of the collection + + Raises + ------ + Error + If the node isn't a collection (e.g. list or dict). + """ + return self._info["length"] + + +class ListNode(CollectionNode): # noqa: F811 + """ + Representation of a collection node in a DaggerML DAG. + + Parameters + ---------- + dag : Dag + Parent DAG + ref : Ref + Node reference + """ + + @overload + def __getitem__(self, key: Union[slice, list[int]]) -> "ListNode": ... + @overload + def __getitem__(self, key: Union[int, "Node"]) -> "Node": ... + def __getitem__(self, key: Union[slice, list[int], int, "Node"]) -> "Node": + if isinstance(key, slice): + if key.step is not None: + raise ValueError("Slice step is not supported") + start = key.start if key.start is not None else 0 + stop = key.stop if key.stop is not None else len(self) + key = [start, stop] + return _make_node(self.dag, self.dag._call_builtin("daggerml:get", self.ref, key)) + + def __iter__(self): + """ + Iterate over the node's values (items if it's a list, and keys if it's a + dict) + + Returns + ------- + Node + Result node + + Raises + ------ + Error + If the node isn't a collection (e.g. list or dict). 
+ """ + for i in range(len(self)): + yield self[i] + + def conj(self, item, *, name=None) -> "ListNode": + """ + For a list node, append an item + + Returns + ------- + Node + Node containing the new collection + + Notes + ----- + `append` is an alias `conj` + """ + item_ref = item.ref if isinstance(item, Node) else item + resp = self.dag._call_builtin("daggerml:conj", self.ref, item_ref, name=name) + return cast(ListNode, _make_node(self.dag, resp)) + + def append(self, item, *, name=None) -> "ListNode": + """ + For a list node, append an item + + Returns + ------- + Node + Node containing the new collection + + See Also + -------- + conj : The main implementation + """ + return self.conj(item, name=name) + + +class DictNode(CollectionNode): # noqa: F811 + def __getitem__(self, key: Union[str, "Node"]) -> "Node": + return _make_node(self.dag, self.dag._call_builtin("daggerml:get", self.ref, key)) + + def keys(self) -> list[str]: + """ + Get the keys of a dictionary node. + + Parameters + ---------- + name : str, optional + Name for the result node + + Returns + ------- + list[str] + List of keys in the dictionary node + """ + return self._info["keys"].copy() + + def __iter__(self): + """ + Iterate over the node's values (items if it's a list, and keys if it's a + dict) + + Returns + ------- + Node + Result node + + Raises + ------ + Error + If the node isn't a collection (e.g. list or dict). + """ + for k in self.keys(): + yield k + + def get(self, key, default=None, *, name=None) -> "Node": + """ + For a dict node, return the value for key if key exists, else default. + + If default is not given, it defaults to None, so that this method never raises a KeyError. + """ + return _make_node(self.dag, self.dag._call_builtin("daggerml:get", self.ref, key, default, name=name)) + + def items(self) -> Iterator[tuple[str, "Node"]]: + """ + Iterate over key-value pairs of a dictionary node. 
+ + Returns + ------- + Iterator[tuple[Node, Node]] + Iterator over (key, value) pairs + """ + if self.type != "dict": + raise Error(f"Cannot iterate items of type: {self.type}", origin="dml", type="TypeError") + for k in self: + yield k, self[k] + + def values(self) -> list["Node"]: + """ + Get the values of a dictionary node. + + Parameters + ---------- + name : str, optional + Name for the result node + + Returns + ------- + list[Node] + List of values in the dictionary node + """ + return [self[k] for k in self] + + def assoc(self, key, value, *, name=None) -> "DictNode": + """ + For a dict node, associate a new value into the map + + Returns + ------- + Node + Node containing the new dict + """ + value_ref = value.ref if isinstance(value, Node) else value + resp = self.dag._call_builtin("daggerml:assoc", self.ref, key, value_ref, name=name) + return cast(DictNode, _make_node(self.dag, resp)) + + def update(self, update) -> "DictNode": + """ + For a dict node, update like python dicts + + Returns + ------- + Node + Node containing the new collection + + Notes + ----- + calls `assoc` iteratively for k, v pairs in update. 
+ + See Also + -------- + assoc : The main implementation + """ + for k, v in update.items(): + self = self.assoc(k, v) + return self + + +def codecs() -> list[Any]: + from daggerml.codecs import codecs as builtins + + return builtins() diff --git a/src/daggerml/codecs.py b/src/daggerml/codecs.py new file mode 100644 index 0000000..45d8138 --- /dev/null +++ b/src/daggerml/codecs.py @@ -0,0 +1,198 @@ +"""Codec registry, built-in codecs, and DAG-owned staging helpers.""" + +from __future__ import annotations + +from dataclasses import dataclass +from importlib import metadata +from threading import RLock +from typing import TYPE_CHECKING, Any, Iterator, Protocol + +from daggerml._internal import DmlRepoError, Error, Runnable +from daggerml._internal._db import Ref + +if TYPE_CHECKING: + from daggerml.api import Dag, Node + + +LITERAL_CODEC_ENTRYPOINT_GROUP = "daggerml.codecs" + + +class CodecError(Error): + def __init__(self, message: str): + super().__init__(message, origin="dml-codec", type="codec-error") + + +@dataclass(frozen=True) +class DelayedRef: + name: str + + +@dataclass(frozen=True) +class DelayedLoad: + dagname: str + nodename: str | None = None + + +@dataclass(frozen=True) +class DelayedRunnable: + uri: str + adapter: str + sub: Any | DelayedRunnable | None + kwargs: dict[str, Any] + + +class LiteralCodec(Protocol): + def can_encode(self, value: Any) -> bool: ... + + def encode(self, value: Any, dag: "Dag") -> Any: ... 
+ + +_literal_codecs: list[tuple[int, int, LiteralCodec]] = [] +_literal_codec_seq = 0 +_plugins_loaded = False +_lock = RLock() + + +def _is_codec(value: Any) -> bool: + return callable(getattr(value, "can_encode", None)) and callable(getattr(value, "encode", None)) + + +def _register_unlocked(codec: LiteralCodec, *, priority: int) -> None: + global _literal_codec_seq + _literal_codec_seq += 1 + _literal_codecs.append((priority, _literal_codec_seq, codec)) + _literal_codecs.sort(key=lambda item: (-item[0], item[1])) + + +def register_codec(codec: LiteralCodec, *, priority: int = 0) -> None: + with _lock: + _register_unlocked(codec, priority=priority) + + +def _entry_points() -> list[metadata.EntryPoint]: + points = metadata.entry_points() + result = list(points.select(group=LITERAL_CODEC_ENTRYPOINT_GROUP)) + result.sort(key=lambda ep: (ep.name, ep.value)) + return result + + +def _register_plugin_value(value: Any, *, source: str) -> None: + if _is_codec(value): + _register_unlocked(value, priority=0) + return + if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int) and _is_codec(value[0]): + _register_unlocked(value[0], priority=value[1]) + return + if isinstance(value, (list, tuple)): + for item in value: + _register_plugin_value(item, source=source) + return + raise CodecError(f"Literal codec plugin '{source}' returned invalid codec registration") + + +def ensure_literal_codec_plugins_loaded() -> None: + global _plugins_loaded + with _lock: + if _plugins_loaded: + return + for entry_point in _entry_points(): + source = f"{entry_point.name} ({entry_point.value})" + try: + loaded = entry_point.load() + if _is_codec(loaded): + _register_unlocked(loaded, priority=0) + continue + value = loaded() if callable(loaded) else loaded + _register_plugin_value(value, source=source) + except CodecError: + raise + except Exception as e: + raise CodecError(f"Literal codec plugin '{source}' failed: {e}") from e + _plugins_loaded = True + + +def 
iter_literal_codecs() -> Iterator[LiteralCodec]: + ensure_literal_codec_plugins_loaded() + with _lock: + codecs = [codec for _priority, _seq, codec in _literal_codecs] + yield from codecs + + +def apply_codec(value: Any, *, dag: "Dag") -> Any: + for codec in iter_literal_codecs(): + try: + if codec.can_encode(value): + return codec.encode(value, dag) + except Exception as e: + if isinstance(e, DmlRepoError): + raise + raise CodecError(f"Literal codec {codec.__class__.__name__} failed: {e}") from e + return value + + +def apply_codecs(value: Any, *, dag: "Dag") -> Any: + value = apply_codec(value, dag=dag) + if isinstance(value, (list, tuple)): + return [apply_codecs(v, dag=dag) for v in value] + if isinstance(value, dict): + return {k: apply_codecs(v, dag=dag) for k, v in value.items()} + if isinstance(value, Runnable): + target = apply_codecs(value.target, dag=dag) + sub = apply_codecs(value.sub, dag=dag) + kwargs = {k: apply_codecs(v, dag=dag) for k, v in value.kwargs.items()} + return Runnable(target=target, adapter=value.adapter, kwargs=kwargs, sub=sub) + return value + + +class NodeCodec: + def can_encode(self, value: Any) -> bool: + from daggerml import api as core_api + + return isinstance(value, core_api.Node) + + def encode(self, value: "Node", dag: "Dag") -> Ref: + assert dag.token is not None, "DAG must have a token to encode nodes" + if value.dag.token is not None and value.dag.token == dag.token: + return value.ref + if value.dag.ref is None: + raise CodecError("Cannot encode node from uncommitted DAG in a different index") + try: + return dag.dml.runtime.put_import(dag._require_index_ref(), value.dag.ref, node=value.ref, name=None) + except Exception as e: + raise CodecError(f"Failed to encode cross-dag node import: {e}") from e + + +class DelayedActionCodec: + def can_encode(self, value: Any) -> bool: + return isinstance(value, (DelayedRef, DelayedLoad, DelayedRunnable)) + + def encode(self, value: DelayedRef | DelayedLoad | DelayedRunnable, dag: 
"Dag"): + if isinstance(value, DelayedRef): + return apply_codecs(dag[value.name], dag=dag) + if isinstance(value, DelayedRunnable): + from daggerml.contrib.adapter_registry import get_adapter + + adapter_spec = get_adapter(value.adapter) + uri = apply_codecs(value.uri, dag=dag) + kwargs = apply_codecs(value.kwargs, dag=dag) + sub = apply_codecs(value.sub, dag=dag) + resolved = adapter_spec.resolve_runnable(uri, kwargs, sub) + if not isinstance(resolved, Runnable): + raise CodecError("Adapter resolve_runnable must return Runnable") + return resolved + assert isinstance(value, DelayedLoad) + index = dag.dml.admin.index.get(dag._require_index_ref())["index"] + commit_ref = index["commit"]["ref"] + resolved = dag.dml.dag.get(value.dagname, revision=commit_ref.to)["dag"] + dag_ref = resolved["ref"] + if value.nodename is None: + node_ref = resolved["result"] + else: + node_ref = resolved["names"].get(value.nodename) + if node_ref is None: + raise DmlRepoError(f"Node '{value.nodename}' not found in DAG '{value.dagname}'") + return dag.dml.runtime.put_import(dag._require_index_ref(), dag_ref, node=node_ref, name=None) + + +def codecs() -> list[Any]: + return [NodeCodec(), DelayedActionCodec()] diff --git a/src/daggerml/contrib/__init__.py b/src/daggerml/contrib/__init__.py new file mode 100644 index 0000000..f5b4297 --- /dev/null +++ b/src/daggerml/contrib/__init__.py @@ -0,0 +1 @@ +"""Contrib public package.""" diff --git a/src/daggerml/contrib/adapter_registry.py b/src/daggerml/contrib/adapter_registry.py new file mode 100644 index 0000000..01bde7d --- /dev/null +++ b/src/daggerml/contrib/adapter_registry.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from importlib import metadata +from threading import Lock +from typing import Any + +from daggerml._internal import DmlRepoError + +ADAPTER_ENTRYPOINT_GROUP = "daggerml.contrib.adapters" + +_LOCK = Lock() +_ADAPTER_SPECS: dict[str, Any] = {} +_PLUGINS_LOADED = False + + +def _entry_points() -> 
list[metadata.EntryPoint]: + points = metadata.entry_points() + result = list(points.select(group=ADAPTER_ENTRYPOINT_GROUP)) + result.sort(key=lambda ep: (ep.name, ep.value)) + return result + + +def _validate_adapter_spec(spec: Any) -> tuple[str, Any]: + if not hasattr(spec, "name"): + raise DmlRepoError("Adapter spec missing required attribute: name") + if not hasattr(spec, "executable"): + raise DmlRepoError("Adapter spec missing required attribute: executable") + if not hasattr(spec, "resolve_runnable"): + raise DmlRepoError("Adapter spec missing required attribute: resolve_runnable") + if not hasattr(spec, "send"): + raise DmlRepoError("Adapter spec missing required callable: send") + if not hasattr(spec, "cli"): + raise DmlRepoError("Adapter spec missing required callable: cli") + + name = spec.name + if not isinstance(name, str) or not name: + raise DmlRepoError("Adapter spec name must be a non-empty string") + executable = spec.executable + if not isinstance(executable, str) or not executable: + raise DmlRepoError("Adapter spec executable must be a non-empty string") + if not callable(spec.resolve_runnable): + raise DmlRepoError("Adapter spec missing required callable: resolve_runnable") + if not callable(spec.send): + raise DmlRepoError("Adapter spec missing required callable: send") + if not callable(spec.cli): + raise DmlRepoError("Adapter spec missing required callable: cli") + return name, spec + + +def register_adapter(spec: Any) -> None: + name, normalized = _validate_adapter_spec(spec) + with _LOCK: + _ADAPTER_SPECS[name] = normalized + + +def _register_plugin_value(value: Any, *, source: str) -> None: + try: + register_adapter(value) + return + except DmlRepoError: + pass + + if isinstance(value, (list, tuple, set)): + for item in value: + _register_plugin_value(item, source=source) + return + + if callable(value): + _register_plugin_value(value(), source=source) + return + + raise DmlRepoError(f"Adapter plugin '{source}' returned invalid adapter 
registration") + + +def load_adapter_plugins() -> None: + global _PLUGINS_LOADED + with _LOCK: + if _PLUGINS_LOADED: + return + entry_points = _entry_points() + for ep in entry_points: + source = f"{ep.name} ({ep.value})" + try: + loaded = ep.load() + _register_plugin_value(loaded, source=source) + except Exception as e: + raise DmlRepoError(f"Adapter plugin '{source}' failed: {e}") from e + with _LOCK: + _PLUGINS_LOADED = True + + +def get_adapter(name: str) -> Any: + load_adapter_plugins() + with _LOCK: + spec = _ADAPTER_SPECS.get(name) + if spec is None: + raise DmlRepoError(f"Adapter '{name}' is not registered") + return spec + + +def list_adapters() -> list[str]: + load_adapter_plugins() + with _LOCK: + return sorted(_ADAPTER_SPECS.keys()) + + +def _reset_for_tests() -> None: + global _PLUGINS_LOADED + with _LOCK: + _ADAPTER_SPECS.clear() + _PLUGINS_LOADED = False diff --git a/src/daggerml/contrib/adapters.py b/src/daggerml/contrib/adapters.py new file mode 100644 index 0000000..1750ac4 --- /dev/null +++ b/src/daggerml/contrib/adapters.py @@ -0,0 +1,366 @@ +from __future__ import annotations + +import argparse +import inspect +import json +import sys +import time +from pathlib import Path +from typing import Any +from urllib.parse import urlparse + +import boto3 + +from daggerml._internal import DmlRepoError, ExecutionState, Runnable, Uri, execution_context +from daggerml.contrib.executor_registry import get_executor +from daggerml.contrib.s3 import S3Store, is_s3_uri +from daggerml.util import get_client + + +class AdapterBase: + name = "" + + @classmethod + def resolve_runnable(cls, uri, kwargs, sub): + spec = get_executor(cls.name, uri) + resolved = spec.resolve_runnable(uri, kwargs, sub) + if not isinstance(resolved, Runnable): + raise DmlRepoError(f"Executor '{uri}' resolve_runnable must return Runnable") + return resolved + + @classmethod + def send( + cls, + *, + runnable: Runnable, + argv_ptr: str, + cache_key: str, + execution_id: str, + remote: 
dict[str, str], + state: dict[str, Any] | None, + execution_status: str | None, + cancel_requested_by: str | None, + ): + raise NotImplementedError("Adapter send method is not implemented") + + @classmethod + def _dump_payload( + cls, + *, + runnable: Runnable, + argv_ptr: str, + cache_key: str, + execution_id: str, + remote: dict[str, str], + state: dict[str, Any] | None, + execution_status: str | None = None, + cancel_requested_by: str | None = None, + ) -> bytes: + def _encode(value: Any) -> Any: + if isinstance(value, Runnable): + return { + "target": value.target.uri, + "kwargs": {k: _encode(v) for k, v in value.kwargs.items()}, + "adapter": value.adapter, + "sub": None if value.sub is None else _encode(value.sub), + } + if isinstance(value, Uri): + return value.uri + if isinstance(value, dict): + return {k: _encode(v) for k, v in value.items()} + if isinstance(value, list): + return [_encode(v) for v in value] + if isinstance(value, tuple): + return [_encode(v) for v in value] + return value + + payload: dict[str, Any] = { + "runnable": _encode(runnable), + "argv_ptr": argv_ptr, + "cache_key": cache_key, + "execution_id": execution_id, + "remote": _encode(remote), + "state": _encode(state), + "execution_status": execution_status, + "cancel_requested_by": cancel_requested_by, + } + return json.dumps(payload).encode("utf-8") + + @staticmethod + def _call_with_supported_kwargs(fn, **kwargs): + signature = inspect.signature(fn) + if any(param.kind is inspect.Parameter.VAR_KEYWORD for param in signature.parameters.values()): + return fn(**kwargs) + supported = {name: value for name, value in kwargs.items() if name in signature.parameters} + return fn(**supported) + + @staticmethod + def _decode_runnable(value: Any) -> Runnable: + if isinstance(value, Runnable): + return value + if not isinstance(value, dict): + raise DmlRepoError("Adapter runnable payload must be a dict") + target = value.get("target") + kwargs = value.get("kwargs", {}) + adapter = 
value.get("adapter") + sub = value.get("sub") + if not isinstance(target, str): + raise DmlRepoError("Adapter runnable target must be a string") + if not isinstance(kwargs, dict): + raise DmlRepoError("Adapter runnable kwargs must be a dict") + if not isinstance(adapter, str): + raise DmlRepoError("Adapter runnable adapter must be a string") + return Runnable( + target=Uri(target), + kwargs=kwargs, + adapter=adapter, + sub=(None if sub is None else AdapterBase._decode_runnable(sub)), + ) + + @classmethod + def _parse_payload( + cls, payload: dict + ) -> tuple[str, str, str, Runnable, dict[str, str], dict[str, Any] | None, str | None, str | None]: + argv_ptr = payload["argv_ptr"] + cache_key = payload["cache_key"] + execution_id = payload["execution_id"] + remote = payload["remote"] + state = payload.get("state") + execution_status = payload.get("execution_status") + cancel_requested_by = payload.get("cancel_requested_by") + if not isinstance(argv_ptr, str): + raise DmlRepoError("Adapter payload argv_ptr must be a string") + if not isinstance(cache_key, str): + raise DmlRepoError("Adapter payload cache_key must be a string") + if not isinstance(execution_id, str): + raise DmlRepoError("Adapter payload execution_id must be a string") + if not isinstance(remote, dict): + raise DmlRepoError("Adapter payload remote must be a dict") + if state is not None and not isinstance(state, dict): + raise DmlRepoError("Adapter payload state must be a dict or null") + if execution_status is not None and not isinstance(execution_status, str): + raise DmlRepoError("Adapter payload execution_status must be a string or null") + if cancel_requested_by is not None and not isinstance(cancel_requested_by, str): + raise DmlRepoError("Adapter payload cancel_requested_by must be a string or null") + return ( + argv_ptr, + cache_key, + execution_id, + cls._decode_runnable(payload["runnable"]), + remote, + state, + execution_status, + cancel_requested_by, + ) + + @staticmethod + def 
_validate_output(result): + if not isinstance(result, dict): + raise DmlRepoError("Adapter output must be a dict") + status = result.get("status") + if status not in {"running", "succeeded", "failed", "cancel-detached"}: + raise DmlRepoError("Adapter output status must be one of running|succeeded|failed|cancel-detached") + allowed_keys = {"status", "error"} + if status == "succeeded": + allowed_keys.add("dag_id") + elif status == "running": + allowed_keys.add("state") + extra = set(result.keys()) - allowed_keys + if extra: + raise DmlRepoError(f"Adapter output has unexpected keys: {', '.join(sorted(extra))}") + error = result.get("error") + if status == "failed": + if error is None: + raise DmlRepoError("Adapter output failed requires error") + elif status == "cancel-detached": + if error is not None: + raise DmlRepoError("Adapter output cancel-detached requires error=None") + elif status == "running": + if error is not None: + raise DmlRepoError("Adapter output running requires error=None") + state = result.get("state") + if not isinstance(state, dict): + raise DmlRepoError("Adapter output running requires state") + else: + if error is not None: + raise DmlRepoError("Adapter output succeeded requires error=None") + dag_id = result.get("dag_id") + if not isinstance(dag_id, str) or not dag_id: + raise DmlRepoError("Adapter output succeeded requires dag_id") + return result + + @classmethod + def _read_input(cls, input_path: str) -> str: + if input_path == "-": + return sys.stdin.read() + if is_s3_uri(input_path): + return S3Store().get(input_path).decode("utf-8") + return Path(input_path).read_text() + + @classmethod + def _write_output(cls, output_path: str, data: str) -> None: + if output_path == "-": + sys.stdout.write(data) + if not data.endswith("\n"): + sys.stdout.write("\n") + sys.stdout.flush() + return + if is_s3_uri(output_path): + parsed = urlparse(output_path) + bucket = parsed.netloc + key = parsed.path.lstrip("/") + boto3.client("s3").put_object( + 
Bucket=bucket, + Key=key, + Body=data.encode("utf-8"), + ContentType="application/json", + ) + return + Path(output_path).write_text(data) + + @staticmethod + def _refresh_execution_payload( + *, cache_key: str, execution_id: str, remote: dict[str, str], fallback_state: dict[str, Any] | None + ) -> tuple[dict[str, Any] | None, str | None, str | None]: + state_backend = ExecutionState(cache_key, remote_root=remote["root"]) + launch_state = state_backend.read_launch_state(execution_id) + record = state_backend.read_execution_record(execution_id) + if launch_state is None and record is None: + return fallback_state, None, None + state = fallback_state + if state is None and launch_state is not None: + state = launch_state.get("resume_state") + return state, (record.get("lifecycle") if record is not None else None), ( + record.get("cancellation_requested_by") if record is not None else None + ) + + @classmethod + def cli(cls, argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=f"{cls.__name__} CLI") + parser.add_argument("-i", "--input", default="-") + parser.add_argument("-o", "--output", default="-") + parser.add_argument("--poll", action="store_true") + args = parser.parse_args(argv) + + raw = cls._read_input(args.input) + payload = json.loads(raw) + ( + argv_ptr, + cache_key, + execution_id, + runnable, + remote, + state, + execution_status, + cancel_requested_by, + ) = cls._parse_payload(payload) + with execution_context(execution_id, cache_key): + result = cls.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + state=state, + execution_status=execution_status, + cancel_requested_by=cancel_requested_by, + ) + persisted_state = state + current_status = execution_status + current_cancel_requested_by = cancel_requested_by + while args.poll and result.get("status") not in {"succeeded", "failed", "cancel-detached"}: + persisted_state, current_status, 
current_cancel_requested_by = cls._refresh_execution_payload( + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + fallback_state=persisted_state if persisted_state is not None else result.get("state"), + ) + time.sleep(0.05) + result = cls.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + state=persisted_state, + execution_status=current_status, + cancel_requested_by=current_cancel_requested_by, + ) + cls._write_output(args.output, json.dumps(result)) + return 0 + + +class LocalAdapter(AdapterBase): + name = "local" + executable = "dml-local-adapter" + + @classmethod + def send( + cls, + *, + runnable: Runnable, + argv_ptr: str, + cache_key: str, + execution_id: str, + remote: dict[str, str], + state: dict[str, Any] | None, + execution_status: str | None = None, + cancel_requested_by: str | None = None, + ): + spec = get_executor("local", runnable.target.uri) + if not hasattr(spec, "handle"): + raise DmlRepoError(f"Executor '{runnable.target.uri}' does not support handle()") + result = spec.handle( + cache_key=cache_key, + execution_id=execution_id, + state=state, + execution_status=execution_status, + cancel_requested_by=cancel_requested_by, + runnable=runnable, + argv_ptr=argv_ptr, + remote=remote, + ) + return cls._validate_output(result) + + +class LambdaAdapter(AdapterBase): + name = "lambda" + executable = "dml-lambda-adapter" + + @classmethod + def send( + cls, + *, + runnable: Runnable, + argv_ptr: str, + cache_key: str, + execution_id: str, + remote: dict[str, str], + state: dict[str, Any] | None, + execution_status: str | None = None, + cancel_requested_by: str | None = None, + ): + client = get_client("lambda") + response = client.invoke( + FunctionName=runnable.target.uri, + InvocationType="RequestResponse", + Payload=cls._dump_payload( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + state=state, + 
execution_status=execution_status, + cancel_requested_by=cancel_requested_by, + ), + ) + stream = response.get("Payload") + if stream is None: + raise DmlRepoError("Lambda adapter invoke response missing Payload") + body = stream.read().decode("utf-8") + try: + result = json.loads(body) if body else {} + except json.JSONDecodeError as e: + raise DmlRepoError(f"Lambda adapter response payload must be JSON: {e}") from e + return cls._validate_output(result) diff --git a/src/daggerml/contrib/api.py b/src/daggerml/contrib/api.py new file mode 100644 index 0000000..0e2d559 --- /dev/null +++ b/src/daggerml/contrib/api.py @@ -0,0 +1,596 @@ +from __future__ import annotations + +import ast +import inspect +import linecache +from dataclasses import dataclass, fields, is_dataclass +from functools import wraps +from pathlib import Path +from textwrap import dedent +from typing import Any, Callable, Protocol, TypeAlias, TypeVar, cast, overload + +from daggerml import api as core_api +from daggerml._internal import DmlRepoError, Runnable +from daggerml.codecs import DelayedLoad, DelayedRef, DelayedRunnable + +try: + from typing import dataclass_transform +except ImportError: + from typing_extensions import dataclass_transform + +_DAGCLASS_CALL_NODE_NAME = "" +_DAGCLASS_RESERVED_NAMES = {"dag", "dml", "argv", "call", "put", "commit"} + + +def _iter_dagclass_members(instance): + members = getattr(instance, "__dagclass_members__", None) + order = getattr(instance, "__dagclass_member_order__", None) + if isinstance(members, dict) and isinstance(order, list): + for name in order: + if name in members: + yield name, members[name] + return + seen: set[str] = set() + for f in fields(instance): + name = f.name + if name.startswith("_"): + continue + seen.add(name) + yield name, getattr(instance, name) + for name, class_value in instance.__class__.__dict__.items(): + if name.startswith("_") or name in seen: + continue + if callable(class_value): + continue + yield name, getattr(instance, 
name) + + +class _DagclassAnalyzer(ast.NodeVisitor): + def __init__(self, *, member_names: set[str], method_names: set[str]): + self.member_names = member_names + self.method_names = method_names + self.dependencies: list[str] = [] + self._dep_set: set[str] = set() + + def _add_dependency(self, name: str) -> None: + if name not in self._dep_set: + self._dep_set.add(name) + self.dependencies.append(name) + + def _unsupported(self, msg: str) -> None: + raise DmlRepoError(msg) + + def _read_self_name(self, name: str, assigned: set[str]) -> None: + if name not in self.member_names: + raise DmlRepoError(f"Unknown dagclass member reference: self.{name}") + if name not in assigned: + self._add_dependency(name) + + def _assign_self_name(self, name: str) -> None: + if name not in self.member_names: + raise DmlRepoError(f"Unknown dagclass member assignment: self.{name}") + if name in self.method_names: + raise DmlRepoError(f"Cannot assign to compiled dagclass method: self.{name}") + + def _visit_expr(self, node: ast.AST, assigned: set[str]) -> None: + if isinstance(node, ast.Attribute): + if isinstance(node.value, ast.Name) and node.value.id == "self": + if isinstance(node.ctx, ast.Load): + self._read_self_name(node.attr, assigned) + return + if isinstance(node.ctx, ast.Del): + self._unsupported("dagclass methods do not support del self.") + return + self._visit_expr(node.value, assigned) + return + if isinstance(node, ast.Call): + if isinstance(node.func, ast.Name) and node.func.id in {"getattr", "setattr", "hasattr"}: + if node.args and isinstance(node.args[0], ast.Name) and node.args[0].id == "self": + self._unsupported(f"dagclass methods do not support {node.func.id}(self, ...)") + self._visit_expr(node.func, assigned) + for arg in node.args: + self._visit_expr(arg, assigned) + for kw in node.keywords: + if kw.value is not None: + self._visit_expr(kw.value, assigned) + return + if isinstance(node, ast.Subscript): + self._visit_expr(node.value, assigned) + 
self._visit_expr(node.slice, assigned) + return + if isinstance( + node, + ( + ast.ListComp, + ast.SetComp, + ast.DictComp, + ast.GeneratorExp, + ast.Lambda, + ast.Yield, + ast.YieldFrom, + ast.Await, + ), + ): + self._unsupported("dagclass methods do not support dynamic or deferred self-capturing constructs") + for child in ast.iter_child_nodes(node): + if isinstance(child, ast.expr): + self._visit_expr(child, assigned) + + def _assign_target(self, target: ast.AST) -> set[str]: + names: set[str] = set() + if isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name) and target.value.id == "self": + self._assign_self_name(target.attr) + names.add(target.attr) + elif isinstance(target, (ast.Tuple, ast.List)): + for elt in target.elts: + names.update(self._assign_target(elt)) + elif isinstance(target, ast.Subscript): + if isinstance(target.value, ast.Name) and target.value.id == "self": + return names + self._visit_expr(target.value, set()) + self._visit_expr(target.slice, set()) + return names + + def _visit_stmt_list(self, stmts: list[ast.stmt], assigned_in: set[str]) -> set[str]: + assigned = set(assigned_in) + for stmt in stmts: + assigned = self._visit_stmt(stmt, assigned) + return assigned + + def _visit_stmt(self, stmt: ast.stmt, assigned: set[str]) -> set[str]: + if isinstance(stmt, ast.Return): + if stmt.value is not None: + self._visit_expr(stmt.value, assigned) + return set(assigned) + if isinstance(stmt, ast.Expr): + self._visit_expr(stmt.value, assigned) + return set(assigned) + if isinstance(stmt, ast.Assign): + self._visit_expr(stmt.value, assigned) + out = set(assigned) + for target in stmt.targets: + out.update(self._assign_target(target)) + return out + if isinstance(stmt, ast.AnnAssign): + if stmt.value is not None: + self._visit_expr(stmt.value, assigned) + out = set(assigned) + out.update(self._assign_target(stmt.target)) + return out + if isinstance(stmt, ast.AugAssign): + if ( + isinstance(stmt.target, ast.Attribute) + and 
isinstance(stmt.target.value, ast.Name) + and stmt.target.value.id == "self" + ): + self._read_self_name(stmt.target.attr, assigned) + self._visit_expr(stmt.target, assigned) + self._visit_expr(stmt.value, assigned) + out = set(assigned) + out.update(self._assign_target(stmt.target)) + return out + if isinstance(stmt, ast.If): + self._visit_expr(stmt.test, assigned) + body_out = self._visit_stmt_list(stmt.body, set(assigned)) + orelse_out = self._visit_stmt_list(stmt.orelse, set(assigned)) + return body_out & orelse_out + if isinstance(stmt, (ast.For, ast.AsyncFor, ast.While)): + if isinstance(stmt, ast.For): + self._visit_expr(stmt.iter, assigned) + self._assign_target(stmt.target) + elif isinstance(stmt, ast.AsyncFor): + self._unsupported("dagclass methods do not support async for") + else: + self._visit_expr(stmt.test, assigned) + self._visit_stmt_list(stmt.body, set(assigned)) + orelse_out = self._visit_stmt_list(stmt.orelse, set(assigned)) + return set(assigned) & orelse_out + if isinstance(stmt, (ast.With, ast.AsyncWith)): + if isinstance(stmt, ast.AsyncWith): + self._unsupported("dagclass methods do not support async with") + for item in stmt.items: + self._visit_expr(item.context_expr, assigned) + if item.optional_vars is not None: + self._assign_target(item.optional_vars) + return self._visit_stmt_list(stmt.body, set(assigned)) + if isinstance(stmt, ast.Delete): + for target in stmt.targets: + if ( + isinstance(target, ast.Attribute) + and isinstance(target.value, ast.Name) + and target.value.id == "self" + ): + self._unsupported("dagclass methods do not support del self.") + return set(assigned) + if isinstance( + stmt, + ( + ast.FunctionDef, + ast.AsyncFunctionDef, + ast.ClassDef, + ast.Try, + ast.TryStar, + ast.Raise, + ast.Match, + ast.Assert, + ast.Global, + ast.Nonlocal, + ), + ): + self._unsupported(f"dagclass methods do not support statement type: {type(stmt).__name__}") + return set(assigned) + + def analyze(self, fn: ast.FunctionDef) -> 
list[str]: + self._visit_stmt_list(fn.body, set()) + return list(self.dependencies) + + +def _make_self_helper_class() -> ast.ClassDef: + return cast( + ast.ClassDef, + ast.parse( + "class _DagclassSelf:\n" + " def __getitem__(self, key):\n" + " return getattr(self, key)\n" + " def __setitem__(self, key, value):\n" + " setattr(self, key, value)\n" + ).body[0], + ) + + +def _load_self_attr(name: str) -> ast.Assign: + return ast.Assign( + targets=[ast.Attribute(value=ast.Name(id="self", ctx=ast.Load()), attr=name, ctx=ast.Store())], + value=ast.Subscript( + value=ast.Name(id="dag", ctx=ast.Load()), + slice=ast.Constant(value=name), + ctx=ast.Load(), + ), + ) + + +def _function_from_source(source: str, fn_name: str): + filename = f"" + lines = [line + "\n" for line in source.splitlines()] + linecache.cache[filename] = (len(source), None, lines, filename) + namespace: dict[str, Any] = {} + exec(compile(source, filename, "exec"), namespace, namespace) + return namespace[fn_name] + + +def _compile_plain_dagclass_method(*, cls, method_name: str, method, member_names: set[str], method_names: set[str]): + try: + source = dedent(inspect.getsource(method)) + except (OSError, TypeError) as e: + raise DmlRepoError(f"Failed to inspect dagclass method source for {cls.__name__}.{method_name}: {e}") from e + + module = ast.parse(source) + if len(module.body) != 1 or not isinstance(module.body[0], ast.FunctionDef): + raise DmlRepoError(f"dagclass method source for {cls.__name__}.{method_name} must be a single function") + fn = module.body[0] + if fn.decorator_list: + raise DmlRepoError(f"dagclass method {cls.__name__}.{method_name} has unsupported decorators") + if not fn.args.args or fn.args.args[0].arg != "self": + raise DmlRepoError(f"dagclass method {cls.__name__}.{method_name} must declare self as first parameter") + + analyzer = _DagclassAnalyzer(member_names=member_names, method_names=method_names) + dependencies = analyzer.analyze(fn) + + compiled_fn = ast.FunctionDef( + 
name=method_name, + args=ast.arguments( + posonlyargs=[], + args=[ast.arg(arg="dag", annotation=None), *fn.args.args[1:]], + vararg=fn.args.vararg, + kwonlyargs=fn.args.kwonlyargs, + kw_defaults=fn.args.kw_defaults, + kwarg=fn.args.kwarg, + defaults=fn.args.defaults, + ), + body=[ + _make_self_helper_class(), + ast.Assign( + targets=[ast.Name(id="self", ctx=ast.Store())], + value=ast.Call(func=ast.Name(id="_DagclassSelf", ctx=ast.Load()), args=[], keywords=[]), + ), + *[_load_self_attr(name) for name in dependencies], + *fn.body, + ], + decorator_list=[], + returns=fn.returns, + type_comment=fn.type_comment, + ) + ast.fix_missing_locations(compiled_fn) + compiled_source = ast.unparse(compiled_fn) + "\n" + compiled_callable = _function_from_source(compiled_source, method_name) + delayed = funkify( + compiled_callable, uri="script", adapter="local", prepop={name: ref(name) for name in dependencies} + ) + return delayed, dependencies + + +def _collect_member_dependencies(value: Any, member_names: set[str]) -> set[str]: + deps: set[str] = set() + + def visit(obj: Any) -> None: + if isinstance(obj, DelayedRef): + if obj.name not in member_names: + raise DmlRepoError(f"Unknown dagclass member reference: {obj.name}") + deps.add(obj.name) + return + if isinstance(obj, DelayedLoad): + return + if isinstance(obj, DelayedRunnable): + visit(obj.sub) + visit(obj.kwargs) + return + if isinstance(obj, Runnable): + visit(obj.sub) + visit(obj.kwargs) + return + if isinstance(obj, dict): + for key, value in obj.items(): + visit(key) + visit(value) + return + if isinstance(obj, (list, tuple, set, frozenset)): + for item in obj: + visit(item) + return + + visit(value) + return deps + + +def _toposort_members(member_deps: dict[str, set[str]], order_hint: list[str]) -> list[str]: + ordered: list[str] = [] + temp: set[str] = set() + done: set[str] = set() + + def visit(name: str) -> None: + if name in done: + return + if name in temp: + raise DmlRepoError(f"dagclass member dependency 
cycle detected at: {name}") + temp.add(name) + for dep in sorted( + member_deps.get(name, set()), + key=lambda item: order_hint.index(item) if item in order_hint else len(order_hint), + ): + visit(dep) + temp.remove(name) + done.add(name) + ordered.append(name) + + for name in order_hint: + visit(name) + if set(ordered) != set(order_hint): + raise DmlRepoError("dagclass member ordering is incomplete or inconsistent") + return ordered + + +def _bind_dagclass_value(value): + if getattr(value.__class__, "__dagclass__", False): + entrypoint = getattr(value.__class__, "__dagclass_entrypoint__", "main") + if not hasattr(value, entrypoint): + raise DmlRepoError(f"Dagclass instance missing configured entrypoint: {entrypoint}") + return getattr(value, entrypoint) + return value + + +FunkifyInput: TypeAlias = Callable[..., Any] | Runnable | DelayedRunnable +DagclassType = TypeVar("DagclassType", bound=type[Any]) + + +class _DagclassProtocol(Protocol): + __dagclass__: bool + __dagclass_entrypoint__: str + __dagclass_wrapped_init__: bool + + def __init__(self, *args: Any, **kwargs: Any) -> None: ... 
def is_node_like(x: object) -> bool:
    """Return True if x is a Node or any Delayed* type (DelayedRef, DelayedLoad, DelayedRunnable)."""
    return isinstance(x, (core_api.Node, DelayedRef, DelayedLoad, DelayedRunnable))


def _ensure_contrib_codecs() -> None:
    """Hook invoked by the public helpers before they build Delayed* values.

    Currently a no-op that always returns ``None``.
    """
    return None


def ref(name: str) -> DelayedRef:
    """Return a ``DelayedRef`` pointing at the member called ``name``."""
    _ensure_contrib_codecs()
    return DelayedRef(name)


def load(dagname: str, nodename: str | None = None) -> DelayedLoad:
    """Return a ``DelayedLoad`` for ``dagname`` and, optionally, ``nodename``."""
    _ensure_contrib_codecs()
    return DelayedLoad(dagname=dagname, nodename=nodename)


def _set_dagclass_attr(instance, name: str, value: Any) -> None:
    """Set ``instance.<name>``, tolerating frozen dataclass instances.

    ``dagclass`` forwards ``**dataclass_kwargs`` to ``dataclass``, so a caller
    may request ``frozen=True``. A plain ``setattr`` on such an instance raises
    ``FrozenInstanceError``; in that case the value is written with
    ``object.__setattr__`` instead, the same mechanism the generated
    ``__init__`` of a frozen dataclass uses. Non-frozen instances keep going
    through ordinary ``setattr``.
    """
    # Function-scope import so this helper does not depend on the module's import block.
    from dataclasses import FrozenInstanceError

    try:
        setattr(instance, name, value)
    except FrozenInstanceError:
        object.__setattr__(instance, name, value)


def _compile_dagclass_instance(instance) -> None:
    """Collect, compile and order the members of a dagclass instance.

    Does nothing when ``instance.__dagclass_compiled__`` is already truthy.
    Otherwise:

    1. Every dataclass field is passed through ``_bind_dagclass_value`` and
       recorded as a member, in field order.
    2. Every public attribute in the class ``__dict__`` that is not a field is
       classified: plain functions become methods to compile; other
       non-callable values become members (an instance-level value wins over
       the class-level one).
    3. Plain methods are compiled with ``_compile_plain_dagclass_method`` and
       installed on the instance.
    4. Member dependencies are collected and topologically ordered.

    The results are stored on the instance as ``__dagclass_members__``,
    ``__dagclass_member_order__``, ``__dagclass_member_deps__``,
    ``__dagclass_compiled__`` and ``__dagclass_compile_count__``.

    Raises:
        DmlRepoError: for ``staticmethod``/``classmethod``/``property``
            members, other non-function callables, reserved member names, or a
            member that depends on itself. The helpers called here may raise
            it as well.
    """
    if getattr(instance, "__dagclass_compiled__", False):
        return

    members: dict[str, Any] = {}
    declaration_order: list[str] = []
    method_defs: dict[str, Any] = {}
    field_names = {f.name for f in fields(instance)}

    for f in fields(instance):
        current = getattr(instance, f.name)
        bound = _bind_dagclass_value(current)
        if bound is not current:
            _set_dagclass_attr(instance, f.name, bound)
        members[f.name] = getattr(instance, f.name)
        declaration_order.append(f.name)

    for name, class_value in instance.__class__.__dict__.items():
        if name.startswith("_"):
            continue
        if name in field_names:
            continue
        if isinstance(class_value, (staticmethod, classmethod, property)):
            raise DmlRepoError(f"dagclass member {name} uses unsupported descriptor type: {type(class_value).__name__}")
        if inspect.isfunction(class_value):
            method_defs[name] = class_value
            continue
        if callable(class_value):
            raise DmlRepoError(f"dagclass member {name} uses unsupported callable type: {type(class_value).__name__}")
        if name in instance.__dict__:
            # An instance-level value shadows the class attribute; record it untouched.
            members[name] = getattr(instance, name)
            declaration_order.append(name)
            continue
        bound = _bind_dagclass_value(class_value)
        if bound is not class_value:
            _set_dagclass_attr(instance, name, bound)
        members[name] = getattr(instance, name)
        declaration_order.append(name)

    member_names = set(members) | set(method_defs)
    method_names = set(method_defs)
    reserved = sorted(member_names & _DAGCLASS_RESERVED_NAMES)
    if reserved:
        bad = ", ".join(reserved)
        raise DmlRepoError(f"dagclass uses reserved names: {bad}")

    # Compile every method before installing any of them, so a compile failure
    # cannot leave only some of the compiled methods attached to the instance.
    compiled_methods: dict[str, Any] = {}
    for name, method in method_defs.items():
        # The dependency list returned alongside the callable is not needed here;
        # dependencies are recomputed for all members below.
        compiled, _ = _compile_plain_dagclass_method(
            cls=instance.__class__,
            method_name=name,
            method=method,
            member_names=member_names,
            method_names=method_names,
        )
        compiled_methods[name] = compiled
    for name, compiled in compiled_methods.items():
        _set_dagclass_attr(instance, name, compiled)
        members[name] = compiled
        declaration_order.append(name)

    # `members` is not mutated past this point, so the name set is built once.
    known_members = set(members)
    member_deps: dict[str, set[str]] = {}
    for name, value in members.items():
        deps = _collect_member_dependencies(value, known_members)
        if name in deps:
            raise DmlRepoError(f"dagclass member dependency cycle detected at: {name}")
        member_deps[name] = deps

    order = _toposort_members(member_deps, declaration_order)

    _set_dagclass_attr(instance, "__dagclass_members__", members)
    _set_dagclass_attr(instance, "__dagclass_member_order__", order)
    _set_dagclass_attr(instance, "__dagclass_member_deps__", member_deps)
    _set_dagclass_attr(instance, "__dagclass_compiled__", True)
    _set_dagclass_attr(
        instance,
        "__dagclass_compile_count__",
        getattr(instance, "__dagclass_compile_count__", 0) + 1,
    )


# Typing-only overloads for `dagclass`: called with keyword options only it
# returns a decorator; called with a class it returns that class.
@dataclass_transform()
@overload
def dagclass(
    _cls: None = None, *, entrypoint: str = "main", **dataclass_kwargs: Any
) -> Callable[[DagclassType], DagclassType]: ...
@overload
def dagclass(_cls: DagclassType, *, entrypoint: str = "main", **dataclass_kwargs: Any) -> DagclassType: ...
def dagclass(
    _cls: DagclassType | None = None, *, entrypoint: str = "main", **dataclass_kwargs: Any
) -> Callable[[DagclassType], DagclassType] | DagclassType:
    """Class decorator that turns a class into a compiled dagclass.

    Usable bare (``@dagclass``) or with options (``@dagclass(entrypoint=...)``).
    The class is made a dataclass if it is not one already, tagged with
    ``__dagclass__`` and ``__dagclass_entrypoint__``, and its ``__init__`` is
    wrapped so that ``_compile_dagclass_instance`` runs after construction.

    Args:
        _cls: The class to decorate, or ``None`` when called with options only.
        entrypoint: Name stored as ``__dagclass_entrypoint__``.
        **dataclass_kwargs: Forwarded to ``dataclasses.dataclass`` when the
            class is not yet a dataclass.

    Returns:
        The decorated class, or a decorator when ``_cls`` is ``None``.

    Raises:
        DmlRepoError: if ``dataclass_kwargs`` are given for a class that is
            already a dataclass.
    """

    def wrap(cls: DagclassType) -> DagclassType:
        # NOTE(review): is_dataclass() is also true for an undecorated subclass of
        # a dataclass, so such a subclass is not re-processed by dataclass() here
        # -- confirm that this is intended.
        if not is_dataclass(cls):
            cls = dataclass(cls, **dataclass_kwargs)
        elif dataclass_kwargs:
            bad = ", ".join(sorted(dataclass_kwargs.keys()))
            raise DmlRepoError(f"api.dagclass dataclass kwargs not allowed on pre-dataclass class: {bad}")
        cls = cast(DagclassType, cls)
        dagclass_cls = cast(_DagclassProtocol, cls)
        dagclass_cls.__dagclass__ = True
        dagclass_cls.__dagclass_entrypoint__ = entrypoint

        original_init = dagclass_cls.__init__
        # Decide from the __init__ that is actually in effect, not from the class
        # flag: the flag is inherited by subclasses, and a subclass that received
        # its own dataclass-generated __init__ would otherwise never be wrapped,
        # leaving its instances uncompiled.
        if getattr(original_init, "__dagclass_init_wrapper__", False):
            return cls

        @wraps(original_init)
        def _dagclass_init(self, *args, **kwargs):
            original_init(self, *args, **kwargs)
            _compile_dagclass_instance(self)

        # Marker read by the check above; set after @wraps so it always wins.
        _dagclass_init.__dagclass_init_wrapper__ = True
        dagclass_cls.__init__ = _dagclass_init
        dagclass_cls.__dagclass_wrapped_init__ = True
        return cls

    if _cls is None:
        return wrap
    return wrap(_cls)


def _default_run_name(instance) -> str:
    """Build the default run name ``<module path>::<ClassName>`` for ``instance``.

    When the defining module has no ``__file__`` the dotted module name is
    used. Otherwise the module file path, without its suffix and in POSIX
    form, is made relative to the nearest ancestor directory containing
    ``.git``, or to the current working directory when there is none. If the
    file is not located under that base, the absolute path is used.
    """
    # Function-scope import so this block does not depend on the module's import block.
    import importlib

    cls = instance.__class__
    module = importlib.import_module(cls.__module__)
    module_path = getattr(module, "__file__", None)
    if not module_path:
        return f"{cls.__module__}::{cls.__name__}"

    module_file = Path(module_path).resolve()
    # Path.parents already begins with module_file.parent, so one pass is enough.
    repo_root = next((parent for parent in module_file.parents if (parent / ".git").exists()), None)
    base = repo_root if repo_root is not None else Path.cwd().resolve()
    try:
        rel = module_file.relative_to(base)
    except ValueError:
        rel = module_file
    rel_no_ext = rel.with_suffix("").as_posix()
    return f"{rel_no_ext}::{cls.__name__}"


def run(instance, *args, name: str | None = None, entrypoint: str | None = None, **kwargs):
    """Create a dag from a compiled dagclass instance, call its entrypoint and commit.

    Every member yielded by ``_iter_dagclass_members`` is ``put`` into a new
    dag under its member name. The entrypoint is then called with ``*args`` and
    ``**kwargs`` and the result is committed. Nothing is returned.

    Args:
        instance: A compiled dagclass instance.
        *args: Positional arguments for the entrypoint call.
        name: Dag name and message; defaults to ``_default_run_name(instance)``.
        entrypoint: Attribute to call; defaults to the class's
            ``__dagclass_entrypoint__`` (``"main"`` if unset).
        **kwargs: Keyword arguments for the entrypoint call.

    Raises:
        DmlRepoError: if ``instance`` is not a dagclass instance, is not
            compiled, lacks the entrypoint attribute, or the entrypoint is not
            a ``DelayedRunnable``.
    """
    _ensure_contrib_codecs()

    if not getattr(instance.__class__, "__dagclass__", False):
        raise DmlRepoError("api.run instance is not a dagclass instance")
    if not getattr(instance, "__dagclass_compiled__", False):
        raise DmlRepoError("api.run instance is not compiled")

    entry = entrypoint or getattr(instance.__class__, "__dagclass_entrypoint__", "main")
    if not hasattr(instance, entry):
        raise DmlRepoError(f"api.run entrypoint not found: {entry}")
    fn = getattr(instance, entry)
    if not isinstance(fn, DelayedRunnable):
        raise DmlRepoError("api.run entrypoint must be DelayedRunnable")
    run_name = name or _default_run_name(instance)
    dml = core_api.get_default_dml()
    dag = core_api.new(dml=dml, name=run_name, message=run_name)
    for member_name, member_value in _iter_dagclass_members(instance):
        dag.put(member_value, name=member_name)
    result = dag.call(fn, *args, name=_DAGCLASS_CALL_NODE_NAME, **kwargs)
    dag.commit(result)


# Typing-only overloads for `funkify`: with no positional argument it returns a
# decorator; with a callable, Runnable or DelayedRunnable it returns a DelayedRunnable.
@overload
def funkify(
    sub_or_fn: None = None, *, adapter: str = "local", uri: str = "script", **kwargs: Any
) -> Callable[[FunkifyInput], DelayedRunnable]: ...
@overload
def funkify(
    sub_or_fn: Callable[..., Any], *, adapter: str = "local", uri: str = "script", **kwargs: Any
) -> DelayedRunnable: ...
@overload
def funkify(
    sub_or_fn: Runnable | DelayedRunnable, *, adapter: str = "local", uri: str = "script", **kwargs: Any
) -> DelayedRunnable: ...
+def funkify( + sub_or_fn: FunkifyInput | None = None, *, adapter: str = "local", uri: str = "script", **kwargs: Any +) -> Callable[[FunkifyInput], DelayedRunnable] | DelayedRunnable: + _ensure_contrib_codecs() + + def _make(value: FunkifyInput) -> DelayedRunnable: + if callable(value): + if "fn" in kwargs: + raise DmlRepoError("Unknown kwarg: fn") + return DelayedRunnable(uri=uri, adapter=adapter, sub=None, kwargs={"fn": value, **kwargs}) + if isinstance(value, (Runnable, DelayedRunnable)): + return DelayedRunnable(uri=uri, adapter=adapter, sub=value, kwargs=dict(kwargs)) + raise DmlRepoError(f"Invalid funkify input: {type(value).__name__}") + + if sub_or_fn is None: + return _make + return _make(sub_or_fn) diff --git a/src/daggerml/contrib/codecs.py b/src/daggerml/contrib/codecs.py new file mode 100644 index 0000000..3f64910 --- /dev/null +++ b/src/daggerml/contrib/codecs.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import importlib +from io import BytesIO +from typing import Any + +from daggerml import Uri +from daggerml.contrib.s3 import S3Store + + +def _import_optional(module_name: str) -> Any | None: + try: + return importlib.import_module(module_name) + except ModuleNotFoundError as e: + if e.name == module_name: + return None + raise + + +class PandasDataFrameCodec: + def __init__(self, dataframe_type: type[Any]): + self._dataframe_type = dataframe_type + + def can_encode(self, value: Any) -> bool: + return isinstance(value, self._dataframe_type) + + def encode(self, value: Any, ctx: Any) -> Uri: + buf = BytesIO() + value.to_parquet(buf) + return S3Store().put(data=buf.getvalue(), suffix=".parquet") + + +class PolarsDataFrameCodec: + def __init__(self, dataframe_type: type[Any]): + self._dataframe_type = dataframe_type + + def can_encode(self, value: Any) -> bool: + return isinstance(value, self._dataframe_type) + + def encode(self, value: Any, ctx: Any) -> Uri: + buf = BytesIO() + value.write_parquet(buf) + return 
S3Store().put(data=buf.getvalue(), suffix=".parquet") + + +def literal_codecs() -> list[Any]: + codecs: list[Any] = [] + + pandas = _import_optional("pandas") + if pandas is not None: + codecs.append(PandasDataFrameCodec(pandas.DataFrame)) + + polars = _import_optional("polars") + if polars is not None: + codecs.append(PolarsDataFrameCodec(polars.DataFrame)) + + return codecs diff --git a/src/daggerml/contrib/executor_registry.py b/src/daggerml/contrib/executor_registry.py new file mode 100644 index 0000000..07ff219 --- /dev/null +++ b/src/daggerml/contrib/executor_registry.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +from importlib import metadata +from threading import Lock +from typing import Any + +from daggerml._internal import DmlRepoError + +EXECUTOR_ENTRYPOINT_GROUP = "daggerml.contrib.executors" + +_LOCK = Lock() +_EXECUTOR_SPECS: dict[tuple[str, str], Any] = {} +_PLUGINS_LOADED = False + + +def _entry_points() -> list[metadata.EntryPoint]: + points = metadata.entry_points() + result = list(points.select(group=EXECUTOR_ENTRYPOINT_GROUP)) + result.sort(key=lambda ep: (ep.name, ep.value)) + return result + + +def _validate_executor_spec(spec: Any) -> tuple[str, str, Any]: + if not hasattr(spec, "adapter"): + raise DmlRepoError("Executor spec missing required attribute: adapter") + adapter = spec.adapter + if not isinstance(adapter, str) or not adapter: + raise DmlRepoError("Executor spec adapter must be a non-empty string") + if not hasattr(spec, "name"): + raise DmlRepoError("Executor spec missing required attribute: name") + name = spec.name + if not isinstance(name, str) or not name: + raise DmlRepoError("Executor spec missing required attribute: name") + if hasattr(spec, "resolve_runnable") and callable(spec.resolve_runnable): + # Front-end only (resolve_runnable without lifecycle) is valid, + # but if lifecycle callables are present they must be valid. 
+ has_start = hasattr(spec, "start") + has_cleanup = hasattr(spec, "cleanup") + if has_start or has_cleanup: + if not has_start or not has_cleanup: + raise DmlRepoError( + "Executor spec with resolve_runnable has partial lifecycle: needs both start and cleanup" + ) + if not callable(spec.start) or not callable(spec.cleanup): + raise DmlRepoError("Executor attributes: start, cleanup must be callable") + if hasattr(spec, "poll") and not callable(spec.poll): + raise DmlRepoError("Executor defines poll attribute but it is not callable") + return adapter, name, spec + # back-end only executor spec must have start and cleanup callables + if not hasattr(spec, "start") or not hasattr(spec, "cleanup"): + raise DmlRepoError("Executor spec missing required callables: start, cleanup") + if not callable(spec.start) or not callable(spec.cleanup): + raise DmlRepoError("Executor attributes: start, cleanup must be callable") + if hasattr(spec, "poll") and not callable(spec.poll): + raise DmlRepoError("Executor defines poll attribute but it is not callable") + return adapter, name, spec + + +def register_executor(spec: Any) -> None: + adapter, name, normalized = _validate_executor_spec(spec) + with _LOCK: + _EXECUTOR_SPECS[(adapter, name)] = normalized + + +def _register_plugin_value(value: Any, *, source: str) -> None: + try: + register_executor(value) + return + except DmlRepoError: + pass + + if isinstance(value, (list, tuple, set)): + for item in value: + _register_plugin_value(item, source=source) + return + + if callable(value): + _register_plugin_value(value(), source=source) + return + + raise DmlRepoError(f"Executor plugin '{source}' returned invalid executor registration") + + +def load_executor_plugins() -> None: + global _PLUGINS_LOADED + with _LOCK: + if _PLUGINS_LOADED: + return + entry_points = _entry_points() + for ep in entry_points: + source = f"{ep.name} ({ep.value})" + try: + loaded = ep.load() + _register_plugin_value(loaded, source=source) + except Exception as 
e: + raise DmlRepoError(f"Executor plugin '{source}' failed: {e}") from e + with _LOCK: + _PLUGINS_LOADED = True + + +def get_executor(adapter: str, name: str) -> Any: + load_executor_plugins() + with _LOCK: + spec = _EXECUTOR_SPECS.get((adapter, name)) + if spec is None: + raise DmlRepoError(f"Executor '{name}' is not registered for adapter '{adapter}'") + return spec + + +def list_executors(adapter: str | None = None) -> list[str]: + load_executor_plugins() + with _LOCK: + if adapter is None: + return sorted(name for _adapter, name in _EXECUTOR_SPECS.keys()) + return sorted(name for _adapter, name in _EXECUTOR_SPECS.keys() if _adapter == adapter) + + +def _reset_for_tests() -> None: + global _PLUGINS_LOADED + with _LOCK: + _EXECUTOR_SPECS.clear() + _PLUGINS_LOADED = False diff --git a/src/daggerml/contrib/executors/__init__.py b/src/daggerml/contrib/executors/__init__.py new file mode 100644 index 0000000..fbaef31 --- /dev/null +++ b/src/daggerml/contrib/executors/__init__.py @@ -0,0 +1,10 @@ +from __future__ import annotations + +from daggerml.contrib.executors._base import ExecutorBase +from daggerml.contrib.executors.batch import BatchExecutor +from daggerml.contrib.executors.cfn import CfnExecutor +from daggerml.contrib.executors.docker import DockerExecutor +from daggerml.contrib.executors.script import ScriptExecutor +from daggerml.contrib.executors.ssh import SshExecutor + +__all__ = ["ExecutorBase", "BatchExecutor", "CfnExecutor", "DockerExecutor", "ScriptExecutor", "SshExecutor"] diff --git a/src/daggerml/contrib/executors/_base.py b/src/daggerml/contrib/executors/_base.py new file mode 100644 index 0000000..67de527 --- /dev/null +++ b/src/daggerml/contrib/executors/_base.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from typing import Any + +from daggerml._internal import Runnable + + +class ExecutorBase: + """Base class for all executors. + + The runtime owns durable resumable state. 
Executors receive ``state=None`` + on first launch and the immutable persisted state on later polls. Executors + return terminal or in-progress result dicts via stdout/return value: + + {"status": "running", "error": null, "state": {...}} + {"status": "succeeded", "error": null, "dag_id": ""} + {"status": "failed", "error": ""} + """ + + name: str = "" + adapter: str = "" + execution_status: str | None = None + cancel_requested_by: str | None = None + + # ------------------------------------------------------------------ + # Subclass interface + # ------------------------------------------------------------------ + + def start( + self, + *, + cache_key: str, + execution_id: str, + runnable: Runnable, + argv_ptr: str, + remote: dict[str, str], + ) -> dict[str, Any]: + """Launch execution and return a result dict. + + For synchronous executors this should return the terminal result + immediately. For async executors, return the durable resume state in the + initial ``running`` result. + """ + raise NotImplementedError + + def poll( + self, + *, + cache_key: str, + execution_id: str, + state: dict[str, Any], + remote: dict[str, str], + ) -> dict[str, Any]: + """Check an in-flight job and return a result dict. + + ``state`` is the immutable launch-time state returned by ``start()``. + Return a terminal result when done, or ``{"status": "running", + "error": None, "state": ...}`` while still running. Later returned + state may be ignored by the runtime. + """ + raise NotImplementedError + + def cleanup(self, *, cache_key: str, execution_id: str, remote: dict[str, str], state: dict[str, Any]) -> None: + """Optional cleanup hook called after terminal result is handled. + + Default is a no-op. Subclasses may override to terminate external + resources (containers, batch jobs, etc.) if needed after the executor + is known to be done. 
+ """ + + def cancel( + self, *, cache_key: str, execution_id: str, state: dict[str, Any], remote: dict[str, str] + ) -> dict[str, Any]: + self.cleanup(cache_key=cache_key, execution_id=execution_id, remote=remote, state=state) + return {"status": "cancel-detached", "error": None} + + # ------------------------------------------------------------------ + # Main dispatch + # ------------------------------------------------------------------ + + @classmethod + def handle( + cls, + *, + cache_key: str, + execution_id: str, + state: dict[str, Any] | None, + execution_status: str | None, + cancel_requested_by: str | None, + runnable: Runnable, + argv_ptr: str, + remote: dict[str, str], + ) -> dict[str, Any]: + """Call start or poll depending on whether immutable state exists.""" + executor = cls() + executor.execution_status = execution_status + executor.cancel_requested_by = cancel_requested_by + if execution_status == "cancel-pending" and state is not None: + return executor.cancel( + cache_key=cache_key, + execution_id=execution_id, + state=state, + remote=remote, + ) + if state is None: + return executor.start( + cache_key=cache_key, + execution_id=execution_id, + runnable=runnable, + argv_ptr=argv_ptr, + remote=remote, + ) + return executor.poll( + cache_key=cache_key, + execution_id=execution_id, + state=state, + remote=remote, + ) diff --git a/src/daggerml/contrib/executors/_lambda.py b/src/daggerml/contrib/executors/_lambda.py new file mode 100644 index 0000000..18ced3d --- /dev/null +++ b/src/daggerml/contrib/executors/_lambda.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import traceback + +from daggerml._internal import Runnable, execution_context +from daggerml.contrib.adapters import AdapterBase +from daggerml.contrib.executors._base import ExecutorBase + + +class LambdaExecutorBase(ExecutorBase): + adapter = "lambda" + + def start( + self, + *, + cache_key: str, + execution_id: str, + runnable: Runnable, + argv_ptr: str, + remote: dict[str, 
str], + ): + raise NotImplementedError("LambdaExecutorBase.start must be implemented by subclasses") + + @classmethod + def handler(cls, event, context): + del context + try: + argv_ptr, cache_key, execution_id, runnable, remote, state, execution_status, cancel_requested_by = ( + AdapterBase._parse_payload(event) + ) + with execution_context(execution_id, cache_key): + result = cls.handle( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + state=state, + execution_status=execution_status, + cancel_requested_by=cancel_requested_by, + ) + return AdapterBase._validate_output(result) + except Exception as e: + error = f"Lambda handler failed: {e}\n\n{traceback.format_exc()}" + return {"status": "failed", "error": error} diff --git a/src/daggerml/contrib/executors/batch.py b/src/daggerml/contrib/executors/batch.py new file mode 100644 index 0000000..a520cae --- /dev/null +++ b/src/daggerml/contrib/executors/batch.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +import json +import os +from typing import Any + +from daggerml import Uri +from daggerml._internal import DmlRepoError, ExecutionState, Runnable +from daggerml.contrib.adapters import AdapterBase +from daggerml.contrib.executors._lambda import LambdaExecutorBase +from daggerml.util import get_client + +PENDING_BATCH_STATUSES = {"SUBMITTED", "PENDING", "RUNNABLE", "STARTING", "RUNNING"} +DEFAULT_VCPU = 1 +DEFAULT_MEMORY = 16 * 1024 +DEFAULT_GPU = 0 + +_ADAPTER_IO_NAME = "lambda:batch" + + +class BatchExecutor(LambdaExecutorBase): + name = "batch" + + @staticmethod + def _string(name: str, value: Any) -> str: + if not isinstance(value, str) or not value: + raise DmlRepoError(f"batch executor {name} must be a non-empty string") + return value + + @staticmethod + def _int(name: str, value: Any, *, default: int, min_value: int = 0) -> int: + if value is None: + return default + if not isinstance(value, int) or value < min_value: + raise 
DmlRepoError(f"batch executor {name} must be an int >= {min_value}") + return value + + @classmethod + def _image_uri(cls, value: Any) -> Uri: + if not isinstance(value, Uri): + raise DmlRepoError("batch executor image must be a Uri") + return value + + @classmethod + def resolve_runnable(cls, uri, kwargs, sub): + if sub is None: + raise DmlRepoError("batch executor requires sub runnable") + unknown = sorted(set(kwargs.keys()) - {"lambda_uri", "image", "cpu", "memory", "gpu"}) + if unknown: + raise DmlRepoError(f"Unknown batch executor kwargs: {', '.join(unknown)}") + return Runnable( + target=Uri(cls._string("lambda_uri", kwargs.get("lambda_uri"))), + adapter="dml-lambda-adapter", + kwargs={ + "image": cls._image_uri(kwargs.get("image")), + "cpu": cls._int("cpu", kwargs.get("cpu"), default=DEFAULT_VCPU, min_value=1), + "memory": cls._int("memory", kwargs.get("memory"), default=DEFAULT_MEMORY, min_value=1), + "gpu": cls._int("gpu", kwargs.get("gpu"), default=DEFAULT_GPU, min_value=0), + }, + sub=sub, + ) + + @staticmethod + def _client(): + return get_client("batch") + + @classmethod + def _resource_requirements(cls, kwargs: dict[str, Any]) -> tuple[list[dict[str, str]], str]: + cpu = cls._int("cpu", kwargs.get("cpu"), default=DEFAULT_VCPU, min_value=1) + memory = cls._int("memory", kwargs.get("memory"), default=DEFAULT_MEMORY, min_value=1) + gpu = cls._int("gpu", kwargs.get("gpu"), default=DEFAULT_GPU, min_value=0) + reqs = [ + {"type": "MEMORY", "value": str(memory)}, + {"type": "VCPU", "value": str(cpu)}, + ] + queue_env = "CPU_QUEUE" + if gpu > 0: + reqs.append({"type": "GPU", "value": str(gpu)}) + queue_env = "GPU_QUEUE" + return reqs, cls._string(queue_env, os.environ.get(queue_env)) + + def start( + self, + *, + cache_key: str, + execution_id: str, + runnable: Runnable, + argv_ptr: str, + remote: dict[str, str], + ) -> dict[str, Any]: + if runnable is None or runnable.sub is None: + raise DmlRepoError("batch executor start requires runnable with sub 
runnable") + exec_state = ExecutionState(cache_key, remote_root=remote["root"]) + io = exec_state.adapter_io(execution_id, _ADAPTER_IO_NAME) + payload = AdapterBase._dump_payload( + runnable=runnable.sub, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + state=None, + ) + io.write_input(payload) + client = self._client() + reqs, job_queue = self._resource_requirements(runnable.kwargs) + image = self._image_uri(runnable.kwargs.get("image")) + job_name = f"dml-batch-{cache_key}" + job_def = client.register_job_definition( + jobDefinitionName=job_name, + type="container", + containerProperties={ + "image": image, + "command": [runnable.sub.adapter, "--poll", "-i", io.input_uri, "-o", io.output_uri], + "environment": [], + "jobRoleArn": self._string("BATCH_TASK_ROLE_ARN", os.environ.get("BATCH_TASK_ROLE_ARN")), + "resourceRequirements": reqs, + }, + )["jobDefinitionArn"] + job_id = client.submit_job(jobName=job_name, jobQueue=job_queue, jobDefinition=job_def)["jobId"] + return { + "status": "running", + "error": None, + "state": { + "job_id": job_id, + "job_definition": job_def, + }, + } + + def poll( + self, + *, + cache_key: str, + execution_id: str, + state: dict[str, Any], + remote: dict[str, str], + ) -> dict[str, Any]: + job_id = state.get("job_id") + if not isinstance(job_id, str) or not job_id: + return {"status": "failed", "error": "batch poll: missing job_id in job state"} + try: + jobs = self._client().describe_jobs(jobs=[job_id]).get("jobs", []) + except Exception: + return {"status": "running", "error": None, "state": state} + if not jobs: + return {"status": "running", "error": None, "state": state} + job = jobs[0] + job_status = job["status"] + + if job_status in PENDING_BATCH_STATUSES: + return {"status": "running", "error": None, "state": state} + + if job_status == "SUCCEEDED": + exec_state = ExecutionState(cache_key, remote_root=remote["root"]) + io = exec_state.adapter_io(execution_id, _ADAPTER_IO_NAME) + 
try: + raw = io.read_output() + if raw is None: + return {"status": "failed", "error": "batch poll: sub-adapter output not yet written to S3"} + result = json.loads(raw) + except Exception as e: + return {"status": "failed", "error": f"batch poll: could not read sub-adapter result: {e}"} + if not isinstance(result, dict) or result.get("status") not in {"succeeded", "failed"}: + return {"status": "failed", "error": f"batch poll: unexpected sub-adapter result: {result}"} + return result + + # Failed + reason = None + if isinstance(job.get("statusReason"), str) and job["statusReason"]: + reason = job["statusReason"] + attempts = job.get("attempts") or [{}] + container = attempts[-1].get("container", {}) if attempts else {} + if isinstance(container, dict): + reason = container.get("reason") or container.get("exitCode") or reason + error = f"Batch job {job_id} failed" + if reason not in {None, ""}: + error = f"{error}: {reason}" + return {"status": "failed", "error": error} + + def cancel( + self, *, cache_key: str, execution_id: str, state: dict[str, Any], remote: dict[str, str] + ) -> dict[str, Any]: + del cache_key, execution_id, remote + client = self._client() + job_id = state.get("job_id") + job_definition = state.get("job_definition") + if isinstance(job_id, str) and job_id: + try: + client.cancel_job(jobId=job_id, reason="daggerml cancellation requested") + except Exception: + try: + client.terminate_job(jobId=job_id, reason="daggerml cancellation requested") + except Exception: + pass + if isinstance(job_definition, str) and job_definition: + try: + client.deregister_job_definition(jobDefinition=job_definition) + except Exception: + pass + return {"status": "cancel-detached", "error": None} diff --git a/src/daggerml/contrib/executors/cfn.py b/src/daggerml/contrib/executors/cfn.py new file mode 100644 index 0000000..41e2fd4 --- /dev/null +++ b/src/daggerml/contrib/executors/cfn.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +import json +from 
contextlib import contextmanager +from typing import Any +from uuid import uuid4 + +from daggerml import new, temporary +from daggerml._internal import Runnable +from daggerml.contrib.executors._base import ExecutorBase +from daggerml.util import get_client + +TERMINAL_FAILED_STATUSES = { + "CREATE_FAILED", + "ROLLBACK_COMPLETE", + "ROLLBACK_FAILED", + "DELETE_FAILED", + "UPDATE_ROLLBACK_COMPLETE", + "UPDATE_ROLLBACK_FAILED", +} +TERMINAL_SUCCESS_STATUSES = {"CREATE_COMPLETE", "UPDATE_COMPLETE"} + + +class CfnExecutor(ExecutorBase): + name = "cfn" + adapter = "local" + + @staticmethod + def _client(): + return get_client("cloudformation") + + @classmethod + @contextmanager + def _tmpdag(cls, argv_ptr, *, remote_root: str): + with temporary(remote_root=remote_root, name=f"cfn-{uuid4().hex}") as dml: + with new(dml=dml, argv_ptr=argv_ptr) as dag: + yield dag + + @classmethod + def _commit_dag(cls, metadata, stack, outputs, *, remote_root: str): + argv_ptr = metadata.get("argv_ptr") + with cls._tmpdag(argv_ptr, remote_root=remote_root) as dag: + for k, v in outputs.items(): + dag[k] = v + dag.stack_id = stack["StackId"] + dag.stack_name = metadata["stack_name"] + dag.outputs = outputs + dag.commit(dag.outputs) + return dag.ref.id() + + def start( + self, + *, + cache_key: str, + execution_id: str, + runnable: Runnable, + argv_ptr: str, + remote: dict[str, str], + ) -> dict[str, Any]: + del runnable + with temporary(remote_root=remote["root"], name=f"cfn-{execution_id}") as dml_inst: + with new(dml=dml_inst, argv_ptr=argv_ptr) as dag: + name, template, params = dag.argv[1:4].value() + + client = self._client() + old_stack_id = None + stack_id = None + return_poll = False + try: + stacks = client.describe_stacks(StackName=name)["Stacks"] + old_stack_id = stacks[0]["StackId"] if stacks else None + except Exception: + pass + try: + if old_stack_id is None: + resp = client.create_stack( + StackName=name, + TemplateBody=json.dumps(template), + Parameters=[{"ParameterKey": 
k, "ParameterValue": v} for k, v in params.items()], + Capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM"], + ) + stack_id = resp["StackId"] + else: + resp = client.update_stack( + StackName=name, + TemplateBody=json.dumps(template), + Parameters=[{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()], + Capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM"], + ) + stack_id = resp["StackId"] + except Exception as e: + if "No updates are to be performed" not in str(e): + raise + stack_id = old_stack_id + return_poll = True + + job_state = {"stack_name": name, "stack_id": stack_id, "argv_ptr": argv_ptr} + + if return_poll: + return self.poll(cache_key=cache_key, execution_id=execution_id, state=job_state, remote=remote) + return {"status": "running", "error": None, "state": job_state} + + def poll( + self, + *, + cache_key: str, + execution_id: str, + state: dict[str, Any], + remote: dict[str, str], + ) -> dict[str, Any]: + del cache_key, execution_id + stack_name = state.get("stack_name") + if not stack_name: + return {"status": "failed", "error": "cfn poll: missing stack_name in job state"} + try: + stacks = self._client().describe_stacks(StackName=stack_name)["Stacks"] + except Exception: + return {"status": "running", "error": None, "state": state} + if not stacks: + return {"status": "failed", "error": f"Stack not found: {stack_name}"} + stack = stacks[0] + raw_status = stack["StackStatus"] + if raw_status in TERMINAL_SUCCESS_STATUSES: + outputs = {o["OutputKey"]: o["OutputValue"] for o in stack.get("Outputs", [])} + dag_id = self._commit_dag(state, stack, outputs, remote_root=remote["root"]) + return {"status": "succeeded", "error": None, "dag_id": dag_id} + if raw_status in TERMINAL_FAILED_STATUSES: + error = f"Stack {stack_name} failed: {raw_status}" + try: + events = self._client().describe_stack_events(StackName=stack_name)["StackEvents"] + reasons = [e["ResourceStatusReason"] for e in events if "ResourceStatusReason" in e] + if reasons: 
+ error = f"{error}\n{chr(10).join(reasons)}" + except Exception: + pass + return {"status": "failed", "error": error} + return {"status": "running", "error": None, "state": state} + + def cancel( + self, *, cache_key: str, execution_id: str, state: dict[str, Any], remote: dict[str, str] + ) -> dict[str, Any]: + del cache_key, execution_id, remote + stack_name = state.get("stack_name") + if isinstance(stack_name, str) and stack_name: + client = self._client() + try: + client.cancel_update_stack(StackName=stack_name) + except Exception: + try: + client.delete_stack(StackName=stack_name) + except Exception: + pass + return {"status": "cancel-detached", "error": None} diff --git a/src/daggerml/contrib/executors/docker.py b/src/daggerml/contrib/executors/docker.py new file mode 100644 index 0000000..1d9bfe0 --- /dev/null +++ b/src/daggerml/contrib/executors/docker.py @@ -0,0 +1,228 @@ +from __future__ import annotations + +import json +import shutil +import subprocess +import tarfile +import tempfile +from pathlib import Path +from typing import Any, cast + +from daggerml import Uri +from daggerml._internal import DmlRepoError, ExecutionState, Runnable +from daggerml.contrib.executors._base import ExecutorBase +from daggerml.contrib.s3 import S3Store, is_s3_uri + + +class DockerExecutor(ExecutorBase): + name = "docker" + adapter = "local" + + @classmethod + def resolve_runnable(cls, uri, kwargs, sub): + if sub is None: + raise DmlRepoError("docker executor requires sub runnable") + image = kwargs.get("image") + if image is None: + raise DmlRepoError("docker executor requires image") + unknown = sorted(set(kwargs.keys()) - {"image", "flags"}) + if unknown: + raise DmlRepoError(f"Unknown docker executor kwargs: {', '.join(unknown)}") + return Runnable( + target=Uri("docker"), + kwargs={"image": image, "flags": kwargs.get("flags", [])}, + sub=sub, + adapter="dml-local-adapter", + ) + + @staticmethod + def _run_docker(*args: str, check: bool = True, docker_bin: str | None 
= None) -> str: + docker_bin = docker_bin or shutil.which("docker") + if docker_bin is None: + raise DmlRepoError("docker executable not found in PATH") + proc = subprocess.run([docker_bin, *args], check=False, capture_output=True, text=True) + if proc.returncode == 0: + return proc.stdout.strip() or proc.stderr.strip() + if check: + command = f"{docker_bin} {' '.join(args)}" + raise DmlRepoError( + f"docker command failed ({proc.returncode}): {command}\nSTDOUT:\n{proc.stdout}\nSTDERR:\n{proc.stderr}" + ) + return proc.stdout.strip() or proc.stderr.strip() + + @staticmethod + def _encode_value(value: Any) -> Any: + if isinstance(value, Uri): + return value.uri + if isinstance(value, Runnable): + return DockerExecutor._encode_runnable(value) + if isinstance(value, dict): + return {k: DockerExecutor._encode_value(v) for k, v in value.items()} + if isinstance(value, list): + return [DockerExecutor._encode_value(v) for v in value] + if isinstance(value, tuple): + return [DockerExecutor._encode_value(v) for v in value] + return value + + @staticmethod + def _encode_runnable(runnable: Runnable) -> dict[str, Any]: + return { + "target": runnable.target.uri, + "adapter": runnable.adapter, + "kwargs": DockerExecutor._encode_value(runnable.kwargs), + "sub": None if runnable.sub is None else DockerExecutor._encode_runnable(runnable.sub), + } + + @staticmethod + def _image_input(runnable: Runnable) -> str: + image = runnable.kwargs.get("image") + if hasattr(image, "value") and callable(image.value): + image = image.value() + if isinstance(image, Uri): + return image.uri + if isinstance(image, str) and image: + return image + raise DmlRepoError("docker executor image must resolve to a non-empty Uri or string") + + @staticmethod + def _image_tag_from_tar(tar_path: Path) -> str: + with tarfile.open(tar_path, mode="r") as tf: + member = tf.extractfile("manifest.json") + if member is None: + raise DmlRepoError("docker image tar missing manifest.json") + manifest = 
json.loads(member.read()) + repo_tags = manifest[0].get("RepoTags") if manifest else None + if not isinstance(repo_tags, list) or not repo_tags or not isinstance(repo_tags[0], str) or not repo_tags[0]: + raise DmlRepoError("docker image tar missing RepoTags") + return cast(str, repo_tags[0]) + + @staticmethod + def _prepare_image(runnable: Runnable, workdir: Path, remote: dict[str, Any]) -> tuple[str, str | None]: + image = DockerExecutor._image_input(runnable) + if not is_s3_uri(image): + return image, None + tar_path = workdir / "image.tar" + store = S3Store.from_remote_root(cast(str, remote["root"])) + tar_path.write_bytes(store.get(image)) + image_ref = DockerExecutor._image_tag_from_tar(tar_path) + DockerExecutor._run_docker("load", "-i", str(tar_path)) + return image_ref, image_ref + + def start( + self, + *, + cache_key: str, + execution_id: str, + runnable: Runnable, + argv_ptr: str, + remote: dict[str, str], + ) -> dict[str, Any]: + if runnable.sub is None: + raise DmlRepoError("docker executor requires sub runnable") + exec_state = ExecutionState(cache_key, remote_root=remote["root"]) + io = exec_state.adapter_io(execution_id, "local:docker") + + workdir = Path(tempfile.mkdtemp(prefix=f"dml-docker-{execution_id}-")) + try: + image_ref, cleanup_image = self._prepare_image(runnable, workdir, remote) + finally: + shutil.rmtree(workdir, ignore_errors=True) + + payload: dict[str, Any] = { + "runnable": self._encode_runnable(runnable.sub), + "argv_ptr": argv_ptr, + "cache_key": cache_key, + "execution_id": execution_id, + "remote": remote, + "state": None, + } + input_uri = io.write_input(json.dumps(payload, separators=(",", ":"), sort_keys=True).encode("utf-8")) + + container_id = self._run_docker( + "run", + "-d", + *cast(list[str], runnable.kwargs.get("flags", [])), + "-e", + f"DML_REMOTE_ROOT={remote['root']}", + image_ref, + runnable.sub.adapter, + "--poll", + "-i", + input_uri, + "-o", + io.output_uri, + ) + + return { + "status": "running", + "error": 
None, + "state": { + "container_id": container_id, + "cleanup_image": cleanup_image, + }, + } + + def poll( + self, + *, + cache_key: str, + execution_id: str, + state: dict[str, Any], + remote: dict[str, str], + ) -> dict[str, Any]: + container_id = state.get("container_id") + + if not isinstance(container_id, str) or not container_id: + return {"status": "failed", "error": "docker poll: missing container_id in job state"} + + docker_bin = shutil.which("docker") + if docker_bin is None: + return {"status": "failed", "error": "docker poll: docker executable not found"} + + proc = subprocess.run( + [docker_bin, "inspect", "--format", "{{.State.Status}}", container_id], + check=False, + capture_output=True, + text=True, + ) + if proc.returncode != 0: + container_status = "exited" + else: + container_status = proc.stdout.strip() + + if container_status in ("created", "running", "paused", "restarting"): + return {"status": "running", "error": None, "state": state} + + # Container exited + _cleanup_docker(container_id, state.get("cleanup_image"), docker_bin) + + exec_state = ExecutionState(cache_key, remote_root=remote["root"]) + io = exec_state.adapter_io(execution_id, "local:docker") + raw = io.read_output() + if raw is not None: + try: + result = json.loads(raw) + if isinstance(result, dict) and result.get("status") in {"succeeded", "failed"}: + return result + except Exception as e: + return {"status": "failed", "error": f"docker poll: could not read output: {e}"} + + return {"status": "failed", "error": f"docker container {container_id} exited without output"} + + def cancel( + self, *, cache_key: str, execution_id: str, state: dict[str, Any], remote: dict[str, str] + ) -> dict[str, Any]: + del cache_key, execution_id, remote + docker_bin = shutil.which("docker") + if docker_bin is None: + return {"status": "cancel-detached", "error": None} + container_id = state.get("container_id") + if isinstance(container_id, str) and container_id: + 
_cleanup_docker(container_id, state.get("cleanup_image"), docker_bin) + return {"status": "cancel-detached", "error": None} + + +def _cleanup_docker(container_id: str, cleanup_image: str | None, docker_bin: str) -> None: + subprocess.run([docker_bin, "rm", "-f", container_id], check=False, capture_output=True, text=True) + if isinstance(cleanup_image, str) and cleanup_image: + subprocess.run([docker_bin, "image", "rm", "-f", cleanup_image], check=False, capture_output=True, text=True) diff --git a/src/daggerml/contrib/executors/script.py b/src/daggerml/contrib/executors/script.py new file mode 100644 index 0000000..af3ed8b --- /dev/null +++ b/src/daggerml/contrib/executors/script.py @@ -0,0 +1,324 @@ +from __future__ import annotations + +import argparse +import ast +import inspect +import json +import logging +import os +import shutil +import signal +import subprocess +import sys +import tempfile +from contextlib import chdir +from pathlib import Path +from tempfile import TemporaryDirectory +from textwrap import dedent +from typing import TYPE_CHECKING, Any, cast + +import daggerml as dml +from daggerml._internal import DmlRepoError, Runnable, Uri, execution_context +from daggerml.contrib.executors._base import ExecutorBase +from daggerml.contrib.s3 import S3Store + +if TYPE_CHECKING: + from daggerml import Dag + +logger = logging.getLogger(__name__) + + +META_KEY = "__dml_script_exec__" + + +class ScriptExecutor(ExecutorBase): + name = "script" + adapter = "local" + + def __init__(self, runnable: Runnable | None = None, argv_ptr: str | None = None): + self.runnable = runnable + self.argv_ptr = argv_ptr + + @staticmethod + def _script_kwargs(kwargs: dict[str, Any]) -> tuple[dict[str, Any], str]: + allowed = {"fn", "prepop", "extra_objs", "extra_lines"} + unknown = sorted(set(kwargs.keys()) - allowed) + if unknown: + bad = ", ".join(unknown) + raise DmlRepoError(f"Unknown script executor kwargs: {bad}") + fn = kwargs.get("fn") + if not callable(fn): + raise 
DmlRepoError("script resolve_runnable requires callable fn") + prepop = kwargs.get("prepop", {}) + if not isinstance(prepop, dict): + raise DmlRepoError("script prepop must be a dict") + extra_objs = list(kwargs.get("extra_objs", [])) + if not isinstance(extra_objs, list): + raise DmlRepoError(f"script extra_objs must be a list, not {type(extra_objs).__name__}") + extra_lines = list(kwargs.get("extra_lines", [])) + if not isinstance(extra_lines, list) or not all(isinstance(x, str) for x in extra_lines): + raise DmlRepoError("script extra_lines must be a list[str]") + call_kwargs = {} + params = list(inspect.signature(fn).parameters.values()) + if not params or params[0].name != "dag": + raise DmlRepoError("script fn must include first 'dag' parameter") + for p in params[1:]: + has_default = p.default is not inspect._empty + if has_default: + call_kwargs[p.name] = p.default + script = ScriptExecutor._render_script(fn, extra_objs=extra_objs, extra_lines=extra_lines) + return { + META_KEY: { + "prepop": prepop, + "fn_name": fn.__name__, + }, + **call_kwargs, + }, script + + @staticmethod + def _strip_funkify_decorators(source: str) -> str: + module = ast.parse(source) + for node in module.body: + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + node.decorator_list = [] + return ast.unparse(module).strip() + + @staticmethod + def _render_script(fn, *, extra_objs: list[Any], extra_lines: list[str]) -> str: + chunks: list[str] = [] + for obj in [*extra_objs, fn]: + try: + raw = dedent(inspect.getsource(inspect.unwrap(obj))).strip() + chunks.append(ScriptExecutor._strip_funkify_decorators(raw)) + except (OSError, TypeError) as e: + raise DmlRepoError(f"Failed to serialize object source: {e}") from e + + if extra_lines: + chunks.extend(extra_lines) + + script = "\n".join(["\n\n".join(chunks), "\n"]) + try: + mod = ast.parse(script) + except SyntaxError as e: + raise DmlRepoError(f"Generated script is not valid Python: {e}") from e + + if not 
any(isinstance(n, ast.FunctionDef) and n.name == fn.__name__ for n in mod.body): + raise DmlRepoError(f"Function '{fn.__name__}' is not globally defined in generated script") + + return script + + @classmethod + def resolve_runnable(cls, uri, kwargs, sub): + if sub is not None: + raise DmlRepoError("script executor does not accept sub runnable") + resolved_kwargs, script = cls._script_kwargs(dict(kwargs)) + script_uri = S3Store().put(data=script.encode("utf-8"), suffix=".py") + meta = dict(resolved_kwargs[META_KEY]) + meta["script_uri"] = script_uri.uri + return Runnable( + target=Uri("script"), + kwargs={**resolved_kwargs, META_KEY: meta}, + sub=sub, + adapter="dml-local-adapter", + ) + + def start( + self, + *, + cache_key: str, + execution_id: str, + runnable: Runnable, + argv_ptr: str, + remote: dict[str, str], + ) -> dict[str, Any]: + workdir = Path(tempfile.mkdtemp(prefix=f"dml-script-{execution_id[:8]}-")) + payload_path = workdir / "supervisor-input.json" + result_path = workdir / "result.json" + stdout_path = workdir / "stdout.log" + stderr_path = workdir / "stderr.log" + payload = { + "version": 0, + "cache_key": cache_key, + "execution_id": execution_id, + "cmd": [ + sys.executable, + "-m", + "daggerml.contrib.executors.script", + "--execution-id", + execution_id, + "--cache-key", + cache_key, + "--remote-root", + remote["root"], + argv_ptr, + ], + "remote": remote, + "env": {}, + } + payload_path.write_text(json.dumps(payload, separators=(",", ":"), sort_keys=True)) + with stdout_path.open("w") as stdout_f, stderr_path.open("w") as stderr_f: + proc = subprocess.Popen( + [ + sys.executable, + "-m", + "daggerml.contrib.supervisor", + "-i", + str(payload_path), + "-o", + str(result_path), + ], + stdout=stdout_f, + stderr=stderr_f, + start_new_session=True, + close_fds=True, + env={**os.environ, "PYTHONUNBUFFERED": "1", **payload["env"]}, + ) + launch_state = { + "pid": proc.pid, + "workdir": str(workdir), + "result_path": str(result_path), + "stdout_path": 
str(stdout_path), + "stderr_path": str(stderr_path), + } + return {"status": "running", "error": None, "state": launch_state} + + def poll( + self, + *, + cache_key: str, + execution_id: str, + state: dict[str, Any], + remote: dict[str, str], + ) -> dict[str, Any]: + del cache_key, execution_id, remote + result_path = Path(state.get("result_path", "")) + pid = state.get("pid") + + # Polls may run either in the launching adapter process or in a later + # process. Reap children when we can; otherwise fall back to a direct + # PID probe for cross-process polling. + if isinstance(pid, int): + try: + done_pid, _ = os.waitpid(pid, os.WNOHANG) + if done_pid == 0: + return {"status": "running", "error": None, "state": state} + except ChildProcessError: + try: + os.kill(pid, 0) + return {"status": "running", "error": None, "state": state} + except ProcessLookupError: + pass + except PermissionError: + return {"status": "running", "error": None, "state": state} + + # Process exited — read result + if result_path.exists(): + try: + parsed = json.loads(result_path.read_text()) + if isinstance(parsed, dict) and parsed.get("status") in {"succeeded", "failed"}: + _cleanup_workdir(state) + return parsed + except Exception as e: + _cleanup_workdir(state) + return {"status": "failed", "error": f"Could not read supervisor result: {e}"} + + _cleanup_workdir(state) + return {"status": "failed", "error": "Script supervisor exited without result"} + + def cancel( + self, *, cache_key: str, execution_id: str, state: dict[str, Any], remote: dict[str, str] + ) -> dict[str, Any]: + del cache_key, execution_id, remote + pid = state.get("pid") + if isinstance(pid, int): + try: + os.killpg(pid, signal.SIGTERM) + except ProcessLookupError: + pass + except PermissionError: + pass + _cleanup_workdir(state) + return {"status": "cancel-detached", "error": None} + + +def _cleanup_workdir(launch_state: dict[str, Any]) -> None: + workdir = launch_state.get("workdir") + if isinstance(workdir, str) and 
workdir: + shutil.rmtree(workdir, ignore_errors=True) + + +def _terminal_runnable(root: Runnable) -> Runnable: + current = root + while current.sub is not None: + current = current.sub + return current + + +def run_payload(argv_ptr: str, *, execution_id: str, cache_key: str, remote_root: str) -> dict[str, Any]: + namespace: dict[str, Any] = {"logger": logging.getLogger("daggerml.contrib.script")} + + def runit(dag): + runnable_node, *arg_nodes = dag.argv + runnable = _terminal_runnable(cast(Runnable, runnable_node.value())) + metadata = cast(dict[str, Any], runnable.kwargs.pop(META_KEY)) + script_uri = cast(str, metadata["script_uri"]) + script = S3Store().get(script_uri).decode("utf-8") + fn_name = cast(str, metadata["fn_name"]) + call_kwargs = {k: dag.put(v, name=f"dml.kw:{k}") for k, v in runnable.kwargs.items()} + prepop = cast(dict[str, Any], metadata.get("prepop", {})) + for key, value in prepop.items(): + dag.put(value, name=key) + exec(script, namespace) + fn = namespace.get(fn_name) + output = fn(dag, *arg_nodes, **call_kwargs) + if dag.ref is None: + dag.commit(output) + + def succeeded_result(dag: "Dag") -> dict[str, Any]: + if dag.ref is None: + raise DmlRepoError("Script worker succeeded without committed DAG") + return {"status": "succeeded", "error": None, "dag_id": dag.ref.id()} + + with execution_context(execution_id, cache_key): + with dml.temporary(remote_root=remote_root) as dml_instance: + try: + dag = dml.new(dml=dml_instance, argv_ptr=argv_ptr) + except Exception as e: + return {"status": "failed", "error": str(e)} + with TemporaryDirectory(prefix="dml-script-worker-") as tmpd, chdir(tmpd): + try: + with dag: + runit(dag) + return succeeded_result(dag) + except Exception as e: + if dag.ref is not None: + return succeeded_result(dag) + return {"status": "failed", "error": str(e)} + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="daggerml script worker") + parser.add_argument("-o", "--output", 
default="result.json", help="JSON result path or '-' for stdout") + parser.add_argument("--execution-id", required=True) + parser.add_argument("--cache-key", required=True) + parser.add_argument("--remote-root", required=True) + parser.add_argument("argv_ptr") + args = parser.parse_args(argv or sys.argv[1:]) + result = run_payload( + args.argv_ptr, + execution_id=args.execution_id, + cache_key=args.cache_key, + remote_root=args.remote_root, + ) + encoded = json.dumps(result, separators=(",", ":"), sort_keys=True) + if args.output == "-": + sys.stdout.write(encoded) + if not encoded.endswith("\n"): + sys.stdout.write("\n") + else: + Path(args.output).write_text(encoded) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/daggerml/contrib/executors/ssh.py b/src/daggerml/contrib/executors/ssh.py new file mode 100644 index 0000000..004f105 --- /dev/null +++ b/src/daggerml/contrib/executors/ssh.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import json +import logging +import shlex +import subprocess +from typing import Any, TypedDict, cast + +from daggerml import Uri +from daggerml._internal import DmlRepoError, Runnable +from daggerml.contrib.adapters import AdapterBase +from daggerml.contrib.api import is_node_like +from daggerml.contrib.executors._base import ExecutorBase + +SshExecKwargs = TypedDict("SshExecutorKwargs", {"host": str, "flags": list[str], "env_files": list[str]}) + +logger = logging.getLogger(__name__) + + +def _is_node_string_list(value: Any) -> bool: + return is_node_like(value) or isinstance(value, list) and all(isinstance(item, str) and item for item in value) + + +class SshExecutor(ExecutorBase): + name = "ssh" + adapter = "local" + + @classmethod + def handle( + cls, + *, + cache_key: str, + execution_id: str, + state: dict[str, Any] | None, + execution_status: str | None = None, + cancel_requested_by: str | None = None, + runnable: Runnable, + argv_ptr: str, + remote: dict[str, str], + ) -> 
dict[str, Any]: + if runnable is None or runnable.sub is None: + raise DmlRepoError("ssh executor handle requires runnable with sub runnable") + kw = cls._validate_kw(runnable.kwargs) + cmd = [ + "ssh", + *kw["flags"], + kw["host"], + cls._remote_command(env_files=kw["env_files"], adapter=runnable.sub.adapter), + ] + payload = AdapterBase._dump_payload( + runnable=runnable.sub, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + state=state, + execution_status=execution_status, + cancel_requested_by=cancel_requested_by, + ) + logger.debug( + "ssh executor launch host=%s flags=%s env_files=%s adapter=%s cache_key=%s execution_id=%s has_state=%s", + kw["host"], + kw["flags"], + kw["env_files"], + runnable.sub.adapter, + cache_key, + execution_id, + state is not None, + ) + proc = subprocess.run(cmd, input=payload, capture_output=True, check=False) + stdout = proc.stdout.decode("utf-8", errors="replace").strip() + stderr = proc.stderr.decode("utf-8", errors="replace").strip() + logger.debug( + "ssh executor command returncode=%s execution_id=%s stdout=%r stderr=%r", + proc.returncode, + execution_id, + stdout, + stderr, + ) + if proc.returncode != 0: + error = f"SSH command failed ({proc.returncode})" + if stderr: + error = f"{error}: {stderr}" + elif stdout: + error = f"{error}: {stdout}" + logger.debug("ssh executor transport failed execution_id=%s error=%s", execution_id, error) + return {"status": "failed", "error": error} + try: + result = json.loads(stdout) + except json.JSONDecodeError as e: + logger.debug( + "ssh executor invalid json execution_id=%s error=%s stdout=%r", + execution_id, + e, + stdout, + ) + return {"status": "failed", "error": f"SSH nested adapter returned invalid JSON: {e}"} + if not isinstance(result, dict) or result.get("status") not in { + "succeeded", + "failed", + "running", + "cancel-detached", + }: + logger.debug("ssh executor unexpected result execution_id=%s result=%r", execution_id, result) + 
return {"status": "failed", "error": f"SSH nested adapter returned unexpected result: {result}"} + logger.debug( + "ssh executor result execution_id=%s status=%s error=%r", + execution_id, + result.get("status"), + result.get("error"), + ) + return result + + @staticmethod + def _validate_kw(kw: dict) -> SshExecKwargs: + if not isinstance(kw, dict): + raise DmlRepoError("ssh executor kwargs must be a dict") + if set(kw.keys()) > {"env_files", "flags", "host"}: + raise DmlRepoError("ssh executor kwargs only supports keys: env_files, flags, host") + host = cast(str, kw.get("host")) + if not (is_node_like(host) or (isinstance(host, str) and host)): + raise DmlRepoError("ssh executor requires non-empty host") + kw["flags"] = flags = cast(list[str], kw.get("flags") or []) + if not _is_node_string_list(flags): + raise DmlRepoError("ssh executor flags must be a list of non-empty strings") + kw["env_files"] = env_files = cast(list[str], kw.get("env_files") or []) + if not _is_node_string_list(env_files): + raise DmlRepoError("ssh executor env_files must be a list of non-empty strings") + return SshExecKwargs(host=host, flags=flags, env_files=env_files) + + @classmethod + def resolve_runnable(cls, uri, kwargs, sub): + if sub is None: + raise DmlRepoError("ssh executor requires sub runnable") + unknown = sorted(set(kwargs.keys()) - {"env_files", "flags", "host"}) + if unknown: + raise DmlRepoError(f"Unknown ssh executor kwargs: {', '.join(unknown)}") + return Runnable( + target=Uri("ssh"), + kwargs=dict(cls._validate_kw(kwargs)), + sub=sub, + adapter="dml-local-adapter", + ) + + @staticmethod + def _remote_command(*, env_files: list[str], adapter: str) -> str: + parts = ["set -e"] + parts.extend(f". 
@api.funkify(uri="script", adapter="local", extra_objs=(_run,))
def docker_build(dag, context_tarball, build_flags=(), repo=None):
    """Build a docker image from a tarball stored in S3 and publish the result.

    The context tarball is extracted into the current directory and built as
    ``dml:<random hex tag>``. When ``repo`` is given the image is tagged as
    ``<repo.uri>:<tag>``, pushed, and the remote image URI is stored on the dag
    under the name ``remote-image``. Otherwise the image is saved to
    ``./image.tar`` and uploaded through ``S3Store.put``.

    Args:
        dag: Dag handle; only ``dag.put`` is used here.
        context_tarball: Object whose ``.value()`` is the tarball location
            passed to ``S3Store.untar``.
        build_flags: Extra ``docker build`` arguments, either a plain iterable
            of strings or an object exposing ``.value()`` that yields one.
        repo: ``None`` or an object whose ``.value()`` is ``None`` or has a
            ``.uri`` attribute naming the target repository.

    Raises:
        DmlRepoError: from ``_run`` when any docker command exits non-zero.
    """
    from uuid import uuid4

    from daggerml import Uri
    from daggerml.contrib.s3 import S3Store

    # ``repo`` is already guarded against arriving as a raw ``None`` default;
    # give ``build_flags`` the same treatment, since a raw ``()`` default has
    # no ``.value()`` and would raise AttributeError.
    if hasattr(build_flags, "value"):
        build_flags = build_flags.value()
    build_flags = tuple(build_flags)

    store = S3Store()
    tag = uuid4().hex
    local_image = f"dml:{tag}"
    store.untar(context_tarball.value(), ".")
    _run("docker", "build", *build_flags, "-t", local_image, ".")
    repo = repo.value() if repo is not None else None
    if repo is not None:
        remote_image = f"{repo.uri}:{tag}"
        _run("docker", "tag", local_image, remote_image)
        _run("docker", "push", remote_image)
        return dag.put(Uri(remote_image), name="remote-image")
    image_tar = "./image.tar"
    _run("docker", "save", "-o", image_tar, local_image)
    return store.put(filepath=image_tar, suffix=".tar")
params, name=f"cfn:{name}") + return stack + + +__all__ = ["docker_build"] diff --git a/src/daggerml/contrib/s3.py b/src/daggerml/contrib/s3.py new file mode 100644 index 0000000..b449d7b --- /dev/null +++ b/src/daggerml/contrib/s3.py @@ -0,0 +1,254 @@ +from __future__ import annotations + +import fnmatch +import hashlib +import io +import json +import os +import tarfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable, Literal, cast +from urllib.parse import urlparse + +import boto3 + +from daggerml import Dml, Node, Uri +from daggerml._internal import DmlRepoError + + +def is_s3_uri(value: str) -> bool: + p = urlparse(value) + return p.scheme == "s3" and bool(p.netloc) and bool(p.path and p.path != "/") + + +def _boto3_client(service: str): + return boto3.client(service) + + +def _sha256_bytes(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +def _flatten_names(*name_or_uris): + if len(name_or_uris) == 1 and isinstance(name_or_uris[0], (list, tuple)): + return list(name_or_uris[0]) + return list(name_or_uris) + + +def _validate_safe_extract_path(*, dest_path: Path, member_name: str) -> None: + member_path = Path(member_name) + if member_path.is_absolute(): + raise DmlRepoError(f"Refusing to extract absolute tar path: {member_name}") + target_path = (dest_path / member_path).resolve() + if os.path.commonpath([str(dest_path), str(target_path)]) != str(dest_path): + raise DmlRepoError(f"Refusing to extract path outside destination: {member_name}") + + +@dataclass(frozen=True) +class S3Store: + bucket: str | None = None + prefix: str | None = None + client: Any = None + + def __post_init__(self): + bucket = self.bucket + prefix = self.prefix + if bucket is None and prefix is None: + remote_root = Dml().config.show()["remote"]["root"] + if not remote_root: + raise DmlRepoError( + "S3Store requires configured remote.root (set DML_REMOTE_ROOT or pass bucket/prefix)" + ) + p = urlparse(remote_root) + if 
def parse_uri(self, name_or_uri) -> tuple[str, str]:
    """Translate a name or S3 URI into a ``(bucket, key)`` pair.

    A ``Node`` is unwrapped with ``.value()`` and a ``Uri`` with ``.uri``
    before parsing. An ``s3://`` URI yields its own bucket and key; any other
    string is treated as a name relative to this store's bucket and prefix.

    Raises:
        DmlRepoError: if the unwrapped value is not a string, or a bare name is
            given while no bucket is configured.
    """
    ref = name_or_uri
    if isinstance(ref, Node):
        ref = ref.value()
    if isinstance(ref, Uri):
        ref = ref.uri
    if not isinstance(ref, str):
        raise DmlRepoError("S3Store name_or_uri must be a string or uri-bearing object")

    parsed = urlparse(ref)
    if parsed.scheme == "s3":
        # Drop the single leading "/" that separates bucket and key.
        return parsed.netloc, parsed.path[1:]

    if self.bucket is None:
        raise DmlRepoError("S3Store bucket not configured")
    if self.prefix:
        key = f"{self.prefix}/{ref}"
    else:
        key = ref
    return cast(str, self.bucket), key
def exists(self, name_or_uri) -> bool:
    """Report whether the object addressed by ``name_or_uri`` can be HEADed.

    Returns ``True`` when ``head_object`` succeeds and ``False`` when it fails
    with error code ``404``, ``NoSuchKey`` or ``NotFound`` (read from the
    exception's ``response["Error"]["Code"]``). Every other exception is
    re-raised unchanged.
    """
    bucket, key = self.parse_uri(name_or_uri)
    try:
        self.client.head_object(Bucket=bucket, Key=key)
    except Exception as exc:
        error_info = getattr(exc, "response", {}).get("Error", {})
        if error_info.get("Code") not in {"404", "NoSuchKey", "NotFound"}:
            raise
        return False
    return True
or not root.is_dir(): + raise DmlRepoError("S3Store.tar path must be an existing directory") + if symlinks not in {"ignore", "raise"}: + raise DmlRepoError("S3Store.tar symlinks must be 'ignore' or 'raise'") + patterns = list(excludes) + buf = io.BytesIO() + + def excluded(rel: str) -> bool: + return any(fnmatch.fnmatch(rel, pat) for pat in patterns) + + def normalize(info: tarfile.TarInfo) -> tarfile.TarInfo: + info.uid = 0 + info.gid = 0 + info.uname = "" + info.gname = "" + info.mtime = 0 + return info + + with tarfile.open(fileobj=buf, mode="w") as tf: + for dirpath, dirnames, filenames in os.walk(root): + dirpath = Path(dirpath) + rel_dir = dirpath.relative_to(root).as_posix() + + kept_dirnames = [] + for dirname in sorted(dirnames): + child = dirpath / dirname + rel = child.relative_to(root).as_posix() + if excluded(rel): + continue + if child.is_symlink(): + if symlinks == "raise": + raise DmlRepoError(f"S3Store.tar encountered symlink with symlinks='raise': {rel}") + continue + kept_dirnames.append(dirname) + dirnames[:] = kept_dirnames + + if rel_dir != ".": + tf.addfile(normalize(tf.gettarinfo(str(dirpath), arcname=rel_dir))) + + for filename in sorted(filenames): + p = dirpath / filename + rel = p.relative_to(root).as_posix() + if excluded(rel): + continue + if p.is_symlink(): + if symlinks == "raise": + raise DmlRepoError(f"S3Store.tar encountered symlink with symlinks='raise': {rel}") + continue + with p.open("rb") as f: + tf.addfile(normalize(tf.gettarinfo(str(p), arcname=rel)), fileobj=f) + return self.put(data=buf.getvalue(), suffix=".tar") + + def untar(self, tar_uri, dest: str | os.PathLike[str], *, unsafe: bool = False) -> None: + payload = self.get(tar_uri) + dest_path = Path(dest) + dest_path.mkdir(parents=True, exist_ok=True) + resolved_dest = dest_path.resolve() + with tarfile.open(fileobj=io.BytesIO(payload), mode="r") as tf: + members = tf.getmembers() + if not unsafe: + for member in members: + 
def cd(self, new_prefix: str) -> "S3Store":
    """Return a store on the same bucket/client with the prefix changed.

    ``new_prefix`` is interpreted like a POSIX ``cd`` argument relative to the
    current prefix: ``..`` walks up (never above the bucket root), ``.`` and
    empty segments are ignored, and a value starting with ``/`` replaces the
    prefix entirely.

    The path arithmetic is purely lexical. The previous implementation used
    ``Path.resolve()``, which consults the *local* filesystem, so a prefix
    that happened to match a local symlink (for example ``tmp`` on macOS) or a
    Windows drive-qualified path produced a wrong S3 prefix.

    The result is built with ``type(self)`` so subclasses get an instance of
    their own class back.
    """
    import posixpath

    base = "/" + self.prefix if self.prefix else "/"
    next_prefix = posixpath.normpath(posixpath.join(base, new_prefix)).lstrip("/")
    return type(self)(bucket=self.bucket, prefix=next_prefix, client=self.client)
entry_points: list[Any]) -> dict[str, Any] | None: + path = _object_path(obj) + if path is None: + return None + for ep in entry_points: + if ep.value == path: + return _source("entry_point", getattr(ep, "group", None) or None, ep.name, ep.value) + return None + + +def _match_codec_error_source(message: str, entry_points: list[Any]) -> dict[str, Any]: + match = re.search(r"Literal codec plugin '([^ ]+) \(([^)]+)\)' failed:", message) + if match is None: + return _source("none") + name, value = match.groups() + for ep in entry_points: + if ep.name == name and ep.value == value: + return _source("entry_point", getattr(ep, "group", None) or None, ep.name, ep.value) + return _source("entry_point", codec_mod.LITERAL_CODEC_ENTRYPOINT_GROUP, name, value) + + +def _fqn(obj: Any) -> str: + path = _object_path(obj) + if path is not None: + return path + return f"{type(obj).__module__}:{type(obj).__qualname__}" + + +def _implements(kind: str, obj: Any): + if kind == "adapter": + return { + "resolve_runnable": callable(getattr(obj, "resolve_runnable", None)), + "send": callable(getattr(obj, "send", None)), + "cli": callable(getattr(obj, "cli", None)), + } + if kind == "executor": + return { + "resolve_runnable": callable(getattr(obj, "resolve_runnable", None)), + "start": callable(getattr(obj, "start", None)), + "poll": callable(getattr(obj, "poll", None)), + "cleanup": callable(getattr(obj, "cleanup", None)), + } + return { + "can_encode": callable(getattr(obj, "can_encode", None)), + "encode": callable(getattr(obj, "encode", None)), + } + + +def _registration(kind: str, key: str, obj: Any, *, effective: bool): + return { + "key": key, + "fqn": _fqn(obj), + "effective": effective, + "implements": _implements(kind, obj), + } + + +def _collect_adapter_specs( + value: Any, *, source: dict[str, Any], out: list[tuple[str, Any, dict[str, Any]]], diagnostics: list[dict[str, Any]] +): + try: + name, spec = areg._validate_adapter_spec(value) + except Exception: + pass + else: + 
def _collect_executor_specs(
    value: Any, *, source: dict[str, Any], out: list[tuple[str, Any, dict[str, Any]]], diagnostics: list[dict[str, Any]]
):
    """Flatten whatever an executor plugin produced into ``out``.

    ``value`` is tried, in order, as: a spec accepted by
    ``ereg._validate_executor_spec`` (appended to ``out`` keyed as
    ``"<adapter>:<name>"``), a list/tuple/set whose items are collected
    recursively, or a callable whose return value is collected recursively.
    A failing callable and anything unrecognised are recorded in
    ``diagnostics`` instead of raising.
    """
    # A validation failure is not an error here: it only means ``value`` is not
    # itself a spec and the container/callable forms should be tried next.
    try:
        adapter, name, spec = ereg._validate_executor_spec(value)
        is_spec = True
    except Exception:
        is_spec = False
    if is_spec:
        out.append((f"{adapter}:{name}", spec, source))
        return

    if isinstance(value, (list, tuple, set)):
        for member in value:
            _collect_executor_specs(member, source=source, out=out, diagnostics=diagnostics)
    elif callable(value):
        try:
            produced = value()
        except Exception as exc:
            diagnostics.append(
                _diag(
                    severity="error",
                    scope="executor",
                    code="introspection_failed",
                    message=f"Executor plugin callable failed: {exc}",
                    source=source,
                )
            )
        else:
            _collect_executor_specs(produced, source=source, out=out, diagnostics=diagnostics)
    else:
        diagnostics.append(
            _diag(
                severity="error",
                scope="executor",
                code="registration_invalid",
                message="Executor plugin returned invalid executor registration",
                source=source,
            )
        )
return + if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int) and codec_mod._is_codec(value[0]): + out.append((value[1], value[0], source)) + return + if isinstance(value, (list, tuple)): + for item in value: + _collect_codec_specs(item, source=source, out=out, diagnostics=diagnostics) + return + diagnostics.append( + _diag( + severity="error", + scope="codec", + code="registration_invalid", + message="Literal codec plugin returned invalid codec registration", + source=source, + ) + ) + + +def _adapter_status(diagnostics: list[dict[str, Any]]): + with areg._LOCK: + current = dict(areg._ADAPTER_SPECS) + entry_points = areg._entry_points() + candidates: list[tuple[str, Any, dict[str, Any]]] = [] + + for name, obj in current.items(): + candidates.append((name, obj, _match_entry_point(obj, entry_points) or _source("runtime"))) + + for ep in entry_points: + source = _source("entry_point", areg.ADAPTER_ENTRYPOINT_GROUP, ep.name, ep.value) + try: + loaded = ep.load() + except Exception as e: + diagnostics.append( + _diag( + severity="error", + scope="adapter", + code="entry_point_load_failed", + message=f"Adapter plugin '{ep.name} ({ep.value})' failed: {e}", + source=source, + ) + ) + continue + loaded_specs: list[tuple[str, Any, dict[str, Any]]] = [] + _collect_adapter_specs(loaded, source=source, out=loaded_specs, diagnostics=diagnostics) + candidates.extend(loaded_specs) + + grouped: dict[str, list[int]] = defaultdict(list) + for idx, (key, _obj, _src) in enumerate(candidates): + grouped[key].append(idx) + + effective_idx = {indexes[-1] for indexes in grouped.values()} + for key, indexes in grouped.items(): + if len(indexes) > 1: + diagnostics.append( + _diag( + severity="warning", + scope="adapter", + code="duplicate_key", + message=f"Multiple adapter registrations found for key '{key}'", + source=_source("runtime"), + key=key, + ) + ) + + registrations = [ + _registration("adapter", key, obj, effective=idx in effective_idx) + for idx, (key, 
def _executor_status(diagnostics: list[dict[str, Any]]):
    """List every executor registration candidate and flag duplicated keys.

    Candidates are the specs currently held in ``ereg._EXECUTOR_SPECS``
    (snapshotted under ``ereg._LOCK``) followed by whatever each entry point
    yields when loaded. For each key the last candidate is marked effective;
    keys seen more than once add a ``duplicate_key`` warning and entry points
    that fail to load add an ``entry_point_load_failed`` error to
    ``diagnostics``.
    """
    with ereg._LOCK:
        registered = dict(ereg._EXECUTOR_SPECS)
    entry_points = ereg._entry_points()

    candidates: list[tuple[str, Any, dict[str, Any]]] = [
        (f"{adapter}:{name}", spec, _match_entry_point(spec, entry_points) or _source("runtime"))
        for (adapter, name), spec in registered.items()
    ]

    for ep in entry_points:
        ep_source = _source("entry_point", ereg.EXECUTOR_ENTRYPOINT_GROUP, ep.name, ep.value)
        try:
            plugin = ep.load()
        except Exception as exc:
            diagnostics.append(
                _diag(
                    severity="error",
                    scope="executor",
                    code="entry_point_load_failed",
                    message=f"Executor plugin '{ep.name} ({ep.value})' failed: {exc}",
                    source=ep_source,
                )
            )
            continue
        discovered: list[tuple[str, Any, dict[str, Any]]] = []
        _collect_executor_specs(plugin, source=ep_source, out=discovered, diagnostics=diagnostics)
        candidates += discovered

    # key -> positions in ``candidates``, in first-seen order.
    positions: dict[str, list[int]] = {}
    for position, candidate in enumerate(candidates):
        positions.setdefault(candidate[0], []).append(position)

    for key, hits in positions.items():
        if len(hits) < 2:
            continue
        diagnostics.append(
            _diag(
                severity="warning",
                scope="executor",
                code="duplicate_key",
                message=f"Multiple executor registrations found for key '{key}'",
                source=_source("runtime"),
                key=key,
            )
        )

    winners = {hits[-1] for hits in positions.values()}
    return [
        _registration("executor", key, spec, effective=position in winners)
        for position, (key, spec, _src) in enumerate(candidates)
    ]
def status() -> dict[str, object]:
    """Assemble the plugin status report for adapters, executors and codecs.

    The three collectors share one diagnostics list, which is then sorted by
    ``(scope, code, message)``. The report carries ``schema_version`` 0, a
    ``summary`` of counts, the three registration lists and the diagnostics.
    """
    diagnostics: list[dict[str, Any]] = []
    adapters = _adapter_status(diagnostics)
    executors = _executor_status(diagnostics)
    codecs = _codec_status(diagnostics)

    def _order(entry):
        return entry["scope"], entry["code"], entry["message"]

    def _effective(registrations):
        return sum(1 for registration in registrations if registration["effective"])

    diagnostics.sort(key=_order)

    summary = {
        "has_errors": any(entry["severity"] == "error" for entry in diagnostics),
        "diagnostic_count": len(diagnostics),
        "adapter_registration_count": len(adapters),
        "adapter_effective_count": _effective(adapters),
        "executor_registration_count": len(executors),
        "executor_effective_count": _effective(executors),
        "codec_registration_count": len(codecs),
        # Codec registrations are all counted as effective.
        "codec_effective_count": len(codecs),
    }
    return {
        "schema_version": 0,
        "summary": summary,
        "adapters": adapters,
        "executors": executors,
        "codecs": codecs,
        "diagnostics": diagnostics,
    }
+_CLOUDWATCH_EVENT_OVERHEAD_BYTES = 26 +_CLOUDWATCH_MAX_BATCH_COUNT = 10_000 + + +def _create_logs_client() -> Any: + import boto3 + + endpoint_url = os.environ.get("AWS_ENDPOINT_URL") + kwargs = {"endpoint_url": endpoint_url} if endpoint_url else {} + return boto3.client("logs", **kwargs) + + +def _resource_already_exists(exc: Exception) -> bool: + return getattr(exc, "response", {}).get("Error", {}).get("Code") == "ResourceAlreadyExistsException" + + +class _CloudWatchStream: + def __init__(self, *, cache_key: str, execution_id: str, stream_kind: str): + self.cache_key = cache_key + self.execution_id = execution_id + self.stream_kind = stream_kind + self.stream_name = f"/run/{cache_key}/{stream_kind}" + self._client: Any | None = None + self._enabled = True + self._sequence_token: str | None = None + self._pending_events: list[dict[str, Any]] = [] + self._pending_bytes = 0 + self._lock = threading.Lock() + self._init_client() + self.emit_lifecycle(event="start") + + @staticmethod + def _event_bytes(message: str) -> int: + return len(message.encode("utf-8")) + _CLOUDWATCH_EVENT_OVERHEAD_BYTES + + @staticmethod + def _split_message(message: str) -> list[str]: + encoded = message.encode("utf-8") + if len(encoded) <= _CLOUDWATCH_MAX_MESSAGE_BYTES: + return [message] + + chunks: list[str] = [] + start = 0 + while start < len(encoded): + end = min(start + _CLOUDWATCH_MAX_MESSAGE_BYTES, len(encoded)) + while end > start: + try: + chunks.append(encoded[start:end].decode("utf-8")) + start = end + break + except UnicodeDecodeError: + end -= 1 + else: + raise AssertionError("failed to split UTF-8 message into valid chunks") + return chunks + + def _flush_locked(self) -> None: + if not self._enabled or self._client is None or not self._pending_events: + return + params: dict[str, Any] = { + "logGroupName": _CLOUDWATCH_LOG_GROUP, + "logStreamName": self.stream_name, + "logEvents": list(self._pending_events), + } + if self._sequence_token is not None: + params["sequenceToken"] 
def emit_lifecycle(self, *, event: str, terminal_status: str | None = None) -> None:
    """Emit a JSON marker describing a stream lifecycle event.

    The record holds ``event`` (prefixed with ``stream_``), the execution id,
    the cache key and the stream kind; ``terminal_status`` is included only
    when it is not ``None``. Keys are serialised in sorted order and the text
    is handed to ``self.emit``.
    """
    record = dict(
        event=f"stream_{event}",
        execution_id=self.execution_id,
        cache_key=self.cache_key,
        stream=self.stream_kind,
    )
    extra = {} if terminal_status is None else {"terminal_status": terminal_status}
    self.emit(json.dumps({**record, **extra}, sort_keys=True))
close(self, *, terminal_status: str) -> None: + self.emit_lifecycle(event="end", terminal_status=terminal_status) + with self._lock: + self._flush_locked() + + +def _drain_pipe(pipe: Any, *, local_path: Path, sink: _CloudWatchStream) -> None: + with local_path.open("w") as local_file: + for line in pipe: + local_file.write(line) + local_file.flush() + sink.emit(line) + pipe.close() + + +def _parse_cmd_payload( + payload: dict[str, Any], +) -> tuple[str, str, list[str], dict[str, str], dict[str, str]]: + allowed = {"version", "cache_key", "execution_id", "cmd", "remote", "env"} + unknown = sorted(set(payload) - allowed) + if unknown: + raise DmlRepoError(f"Supervisor payload has unknown fields: {', '.join(unknown)}") + + version = payload.get("version") + if version != 0: + raise DmlRepoError("Supervisor payload version must be 0") + + cache_key = payload.get("cache_key") + if not isinstance(cache_key, str) or not cache_key: + raise DmlRepoError("Supervisor payload cache_key must be a non-empty string") + + execution_id = payload.get("execution_id") + if not isinstance(execution_id, str) or not execution_id: + raise DmlRepoError("Supervisor payload execution_id must be a non-empty string") + + cmd = payload.get("cmd") + if not isinstance(cmd, list) or not cmd or not all(isinstance(x, str) and x for x in cmd): + raise DmlRepoError("Supervisor payload cmd must be a non-empty list[str]") + + remote = payload.get("remote") + if not isinstance(remote, dict): + raise DmlRepoError("Supervisor payload remote must be a dict") + unknown_remote = sorted(set(remote) - {"root"}) + if unknown_remote: + raise DmlRepoError(f"Supervisor payload remote has unknown fields: {', '.join(unknown_remote)}") + if not isinstance(remote.get("root"), str): + raise DmlRepoError("Supervisor payload remote requires string root") + + env = payload.get("env") or {} + if not isinstance(env, dict) or not all(isinstance(k, str) and isinstance(v, str) for k, v in env.items()): + raise 
DmlRepoError("Supervisor payload env must be a dict[str,str]") + + merged_env = os.environ.copy() + merged_env.update(env) + return cache_key, execution_id, cmd, merged_env, {"root": remote["root"]} + + +def _validate_output(result: Any) -> dict[str, Any]: + if not isinstance(result, dict): + raise DmlRepoError("Supervisor result must be a dict") + status = result.get("status") + if status not in {"succeeded", "failed"}: + raise DmlRepoError("Supervisor result status must be one of succeeded|failed after worker exit") + if status == "failed": + expected = {"status", "error"} + if set(result.keys()) != expected: + raise DmlRepoError("Supervisor failed result keys must be exactly: status, error") + error = result.get("error") + if error is None: + raise DmlRepoError("Supervisor result failed requires error") + return result + + expected = {"status", "error", "dag_id"} + if set(result.keys()) != expected: + raise DmlRepoError("Supervisor succeeded result keys must be exactly: status, error, dag_id") + error = result.get("error") + if error is not None: + raise DmlRepoError("Supervisor result succeeded requires error=None") + dag_id = result.get("dag_id") + if not isinstance(dag_id, str) or not re.fullmatch(r"[0-9a-f]{64}", dag_id): + raise DmlRepoError("Supervisor result succeeded requires real dag_id") + return result + + +def run(payload: dict[str, Any]) -> dict[str, Any]: + """Launch a worker subprocess, wait for it to exit, and return the terminal result.""" + cache_key, execution_id, cmd, env, remote = _parse_cmd_payload(payload) + workdir = tempfile.mkdtemp(prefix=f"dml-supervisor-{execution_id[:8]}-") + repo_dir = Path(workdir) / "repo" + repo_dir.mkdir(parents=True, exist_ok=True) + Dml.init(str(repo_dir), remote_root=remote["root"], user="worker") + env = dict(env) + env["DML_PROJECT_HOME"] = str(repo_dir) + result_path = Path(workdir) / "result.json" + stdout_path = Path(workdir) / "stdout.log" + stderr_path = Path(workdir) / "stderr.log" + stdout_sink = 
_CloudWatchStream(cache_key=cache_key, execution_id=execution_id, stream_kind="stdout") + stderr_sink = _CloudWatchStream(cache_key=cache_key, execution_id=execution_id, stream_kind="stderr") + proc = subprocess.Popen( + cmd, + cwd=workdir, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + bufsize=1, + start_new_session=False, + close_fds=True, + ) + assert proc.stdout is not None + assert proc.stderr is not None + stdout_thread = threading.Thread( + target=_drain_pipe, + kwargs={"pipe": proc.stdout, "local_path": stdout_path, "sink": stdout_sink}, + name=f"dml-supervisor-{execution_id[:8]}-stdout", + ) + stderr_thread = threading.Thread( + target=_drain_pipe, + kwargs={"pipe": proc.stderr, "local_path": stderr_path, "sink": stderr_sink}, + name=f"dml-supervisor-{execution_id[:8]}-stderr", + ) + stdout_thread.start() + stderr_thread.start() + proc.wait() + stdout_thread.join() + stderr_thread.join() + + result: dict[str, Any] + if result_path.exists(): + try: + parsed = json.loads(result_path.read_text()) + result = _validate_output(parsed) + except Exception as e: + result = {"status": "failed", "error": f"Supervisor could not read worker result: {e}"} + elif proc.returncode is not None and proc.returncode < 0: + import signal as _signal + + sig = -proc.returncode + try: + sig_name = _signal.Signals(sig).name + except ValueError: + sig_name = str(sig) + result = {"status": "failed", "error": f"Worker killed by signal {sig_name}"} + else: + code = proc.returncode if proc.returncode is not None else -1 + result = {"status": "failed", "error": f"Worker exited without result (code={code})"} + + terminal_status = str(result.get("status", "failed")) + stdout_sink.close(terminal_status=terminal_status) + stderr_sink.close(terminal_status=terminal_status) + return result + + +def _read(path: str) -> str: + if path == "-": + return sys.stdin.read() + return Path(path).read_text() + + +def _write(path: str, 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: read a JSON payload, run it, write the JSON result.

    Args:
        argv: Arguments to parse. ``None`` means "use the process arguments";
            an explicit list, including an empty one, is parsed as given.

    Returns:
        ``0`` once the result has been written.

    Options:
        ``-i/--input`` and ``-o/--output`` name the payload and result files;
        both default to ``-`` (standard input / standard output).
    """
    parser = argparse.ArgumentParser(description="daggerml contrib supervisor")
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    # Pass argv straight through: argparse itself falls back to sys.argv[1:]
    # for None. The former ``argv or sys.argv[1:]`` also replaced an explicit
    # empty list with the process arguments.
    args = parser.parse_args(argv)
    payload = json.loads(_read(args.input))
    result = run(payload)
    _write(args.output, json.dumps(result, separators=(",", ":"), sort_keys=True))
    return 0
DmlRepoError("defunkify requires callable fn in innermost script kwargs") + + sig = inspect.signature(fn) + param_names = tuple(sig.parameters) + + @wraps(fn) + def wrapped(*args: Any, **kwargs: Any) -> Any: + bound = sig.bind_partial(*args, **kwargs) + bound.apply_defaults() + for name, param in sig.parameters.items(): + if name not in bound.arguments or name == param_names[0]: + continue + if param.kind == inspect.Parameter.VAR_POSITIONAL: + bound.arguments[name] = tuple(wrap_node(arg) for arg in bound.arguments[name]) + elif param.kind == inspect.Parameter.VAR_KEYWORD: + bound.arguments[name] = {key: wrap_node(arg) for key, arg in bound.arguments[name].items()} + else: + bound.arguments[name] = wrap_node(bound.arguments[name]) + with TemporaryDirectory(prefix="dml-defunkify-") as tmpd: + with chdir(tmpd): + return fn(*bound.args, **bound.kwargs) + + return wrapped + + +__all__ = ["MockNode", "defunkify"] diff --git a/src/daggerml/core.py b/src/daggerml/core.py deleted file mode 100644 index 123d030..0000000 --- a/src/daggerml/core.py +++ /dev/null @@ -1,903 +0,0 @@ -import json -import logging -import shutil -import subprocess -import time -import traceback as tb -from dataclasses import dataclass, field, fields -from tempfile import TemporaryDirectory -from typing import Any, Callable, Dict, Iterator, Optional, Union, cast, overload - -from daggerml.util import BackoffWithJitter, current_time_millis, kwargs2opts, raise_ex, replace - -log = logging.getLogger(__name__) - -DATA_TYPE = {} - -Scalar = Union[str, int, float, bool, type(None), "Resource", "Executable"] -Collection = Union[list, tuple, set, dict] - - -def dml_type(cls=None, **opts): - def decorator(cls): - DATA_TYPE[opts.get("alias", None) or cls.__name__] = cls - return cls - - return decorator(cls) if cls else decorator - - -def from_data(data): - n, *args = data if isinstance(data, list) else [None, data] - if n is None: - return args[0] - if n == "l": - return [from_data(x) for x in args] - if n == 
"s": - return {from_data(x) for x in args} - if n == "d": - return {k: from_data(v) for (k, v) in args} - if n in DATA_TYPE: - return DATA_TYPE[n](*[from_data(x) for x in args]) - raise ValueError(f"no decoder for type: {n}") - - -def to_data(obj): - if isinstance(obj, Node): - obj = obj.ref - if isinstance(obj, tuple): - obj = list(obj) - n = obj.__class__.__name__ - if isinstance(obj, (type(None), str, bool, int, float)): - return obj - if isinstance(obj, (list, set)): - return [n[0], *[to_data(x) for x in obj]] - if isinstance(obj, dict): - return [n[0], *[[k, to_data(v)] for k, v in obj.items()]] - if n in DATA_TYPE: - return [n, *[to_data(getattr(obj, x.name)) for x in fields(obj)]] - raise ValueError(f"no encoder for type: {n}") - - -def from_json(text): - return from_data(json.loads(text)) - - -def to_json(obj): - return json.dumps(to_data(obj), separators=(",", ":")) - - -@dml_type -@dataclass(frozen=True) -class Ref: # noqa: F811 - """ - Reference to a DaggerML object. - - Parameters - ---------- - to : str - Reference identifier - """ - - to: str - - -@dml_type -@dataclass -class Resource: # noqa: F811 - """ - Representation of an externally managed object with an identifier. - - Parameters - ---------- - uri : str - Resource URI - """ - - uri: str - - -@dml_type -@dataclass -class Executable(Resource): # noqa: F811 - """ - Representation of an executable externally managed object with an identifier. 
- - Parameters - ---------- - uri : str - Resource URI - data : str, optional - Associated data - adapter : str, optional - Adapter cli script - """ - - data: dict = field(default_factory=dict) - adapter: Optional[str] = None - prepop: Dict[str, Union["Node", Scalar, Collection]] = field(default_factory=dict) - - -@dml_type -@dataclass -class Error(Exception): - message: str - origin: str - type: str - stack: list[dict] = field(default_factory=list) - - @classmethod - def from_ex(cls, ex: BaseException) -> "Error": - if isinstance(ex, Error): - return ex - return cls( - message=str(ex), - origin="python", - type=ex.__class__.__name__, - stack=[ - { - "filename": frame.filename, - "function": frame.name, - "lineno": frame.lineno, - "line": (frame.line or "").strip(), - } - for frame in tb.extract_tb(ex.__traceback__) - ], - ) - - def __str__(self): - lines = [f"Traceback (most recent call last) from {self.origin}:\n"] - for frame in self.stack: - lines.append(f' File "{frame["filename"]}", line {frame["lineno"]}, in {frame["function"]}\n') - if "line" in frame and frame["line"]: - lines.append(f" {frame['line']}\n") - lines.append(f"{self.type}: {self.message}") - return "".join(lines) - - -@dataclass -class Dml: - """ - DaggerML cli client wrapper - """ - - config_dir: Union[str, None] = None - project_dir: Union[str, None] = None - cache_path: Union[str, None] = None - repo: Union[str, None] = None - user: Union[str, None] = None - branch: Union[str, None] = None - token: Union[str, None] = None - tmpdirs: dict[str, TemporaryDirectory] = field(default_factory=dict) - - @property - def index(self) -> Optional[str]: - if self.token: - return json.loads(self.token)[-1] - - @property - def kwargs(self) -> dict: - out = { - "config_dir": self.config_dir, - "project_dir": self.project_dir, - "cache_path": self.cache_path, - "repo": self.repo, - "user": self.user, - "branch": self.branch, - } - return {k: v for k, v in out.items() if v is not None} - - @classmethod - def 
temporary(cls, repo="test", user="user", branch="main", cache_path=None, **kwargs) -> "Dml": - """ - Create a temporary Dml instance with specified parameters. - - Parameters - ---------- - repo : str, default="test" - user : str, default="user" - branch : str, default="main" - **kwargs : dict - Additional keyword arguments for configuration include `config_dir`, `project_dir`, and `cache_path`. - If any of those is provided, it will not create a temporary directory for that parameter. If provided and - set to None, the dml default will be used. - """ - tmpdirs = {k: TemporaryDirectory(prefix="dml-") for k in ["config_dir", "project_dir"] if k not in kwargs} - self = cls( - repo=repo, - user=user, - branch=branch, - cache_path=cache_path, - **{k: v.name for k, v in tmpdirs.items()}, - tmpdirs=tmpdirs, - ) - if self.kwargs["repo"] not in [x["name"] for x in self("repo", "list")]: - self("repo", "create", self.kwargs["repo"]) - return self - - def cleanup(self): - [x.cleanup() for x in self.tmpdirs.values()] - - def __call__(self, *args: str, input=None, as_text: bool = False) -> Any: - path = shutil.which("dml") - argv = [path, *kwargs2opts(**self.kwargs), *args] - resp = subprocess.run(argv, check=False, capture_output=True, text=True, input=input) - if resp.returncode != 0: - raise_ex(Error(resp.stderr or "DML command failed", origin="dml", type="CliError")) - log.debug("dml command stderr: %s", resp.stderr) - if resp.stderr: - log.error(resp.stderr.rstrip()) - try: - resp = resp.stdout or "" if as_text else json.loads(resp.stdout or "null") - except json.decoder.JSONDecodeError: - pass - return resp - - def __getattr__(self, name: str): - def invoke(*args, **kwargs): - opargs = to_json([name, args, kwargs]) - token = self.token or to_json([]) - return raise_ex(from_data(self("api", "invoke", token, input=opargs))) - - return invoke - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.cleanup() - - @property - def 
envvars(self): - return {f"DML_{k.upper()}": str(v) for k, v in self.kwargs.items()} - - def new(self, name="", message="", data=None, message_handler=None) -> "Dag": - opts = kwargs2opts(dump="-") if data else [] - token = self("api", "create", *opts, name, message, input=data, as_text=True) - return Dag(replace(self, token=token), message_handler) - - def load(self, name: Union[str, "Node"], recurse=False) -> "Dag": - return Dag(replace(self, token=None), ref=self.get_dag(name, recurse=recurse)) - - -def make_node(dag: "Dag", ref: Ref) -> "Node": - """ - Create a Node from a Dag and Ref. - - Parameters - ---------- - dag : Dag - The parent DAG. - ref : Ref - The reference to the node. - - Returns - ------- - Node - A Node instance representing the reference in the DAG. - """ - info = dag.dml("node", "describe", ref.to) - if info["data_type"] == "list": - node = ListNode(dag, ref, _info=info) - elif info["data_type"] == "dict": - node = DictNode(dag, ref, _info=info) - elif info["data_type"] == "set": - node = ListNode(dag, ref, _info=info) - elif info["data_type"] == "executable": - node = ExecutableNode(dag, ref, _info=info) - else: - node = ScalarNode(dag, ref, _info=info) - if info["doc"]: - object.__setattr__(node, "__doc__", info["doc"]) - return node - - -@dataclass -class Dag: - dml: Dml - message_handler: Optional[Callable] = None - ref: Optional[Ref] = None - - def __repr__(self): - to = self.ref.to if self.ref else self.dml.index or "NA" - return f"Dag({to})" - - def __hash__(self): - "Useful only for tests." 
- return 42 - - def __enter__(self): - "Catch exceptions and commit an Error" - assert not self.ref - return self - - def __exit__(self, exc_type, exc_value, traceback): - if exc_value is not None: - self.commit(Error.from_ex(exc_value)) - - def __getitem__(self, name): - return make_node(self, self.dml.get_node(name, self.ref)) - - def __setitem__(self, name, value): - assert not self.ref - if isinstance(value, Ref): - return self.dml.set_node(name, value) - return self.put(value, name=name) - - def __setattr__(self, name, value): - if name in [x.name for x in fields(self.__class__)]: - return super().__setattr__(name, value) - return self.__setitem__(name, value) - - def __getattr__(self, name): - if name in [x.name for x in fields(self.__class__)]: - return super().__getattribute__(name) - return self.__getitem__(name) - - def __len__(self) -> int: - return len(self.dml.get_names(self.ref)) - - def __iter__(self): - yield from self.keys() - - def keys(self) -> list[str]: - """Get the list of all node names in the dag""" - return self.dml.get_names(self.ref).keys() - - def values(self) -> list["Node"]: - """Get the list of all nodes in the dag""" - nodes = self.dml.get_names(self.ref).values() - return [make_node(self, x) for x in nodes] - - @property - def argv(self) -> "ListNode": - "Access the dag's argv node" - return make_node(self, self.dml.get_argv(self.ref)) - - @property - def result(self) -> "Node": - """Get the result node of the dag""" - ref = self.dml.get_result(self.ref) - assert isinstance(ref, Ref), f"'{self.__class__.__name__}' dag has not been committed yet" - return make_node(self, ref) - - def load(self, dag_name: str, key: str = "result", *, name=None, doc=None) -> "Node": - """Load a node from a different dag into this one - - Parameters - ---------- - dag_name : str - Name of the dag to load - key : str, default="result" - The name of the node (or "result") to import from the loaded dag. By default, it imports the result node. 
- name : str, optional - Name to assign the resulting node in this dag - doc : str, optional - Documentation for the node - - Returns - ------- - Node - Import Node representing the result of the loaded dag - - Examples - -------- - >>> dml = Dml.temporary() - >>> dml.new("my-dag-0", "going to import this").commit(42) - >>> dag = dml.new("my-dag-1", "importing my-dag-0") - >>> node = dag.load("my-dag-0") - >>> node.value() - 42 - """ - resp = getattr(self.dml.load(dag_name), key, None) - if resp is None: - raise_ex(Error(f"dag '{dag_name}' has no '{key}'", origin="dml", type="KeyError")) - return self.put(resp, name=name, doc=doc) - - @overload - def put(self, value: Union[list, set, "ListNode"], *, name=None, doc=None) -> "ListNode": ... - @overload - def put(self, value: Union[dict, "DictNode"], *, name=None, doc=None) -> "DictNode": ... - @overload - def put(self, value: Union[Executable, "ExecutableNode"], *, name=None, doc=None) -> "ExecutableNode": ... - @overload - def put(self, value: Union[Scalar, "ScalarNode"], *, name=None, doc=None) -> "ScalarNode": ... - @overload - def put(self, value: "Node", *, name=None, doc=None) -> "Node": ... - def put(self, value: Union[Scalar, Collection, "Node"], *, name=None, doc=None) -> "Node": - """ - Add a value to the DAG. 
- - Parameters - ---------- - value : Union[Scalar, Collection] - Value to add - name : str, optional - Name for the node - doc : str, optional - Documentation - - Returns - ------- - Node - Node representing the value - - Examples - -------- - >>> dml = Dml.temporary() - >>> dag = dml.new("test", "test") - >>> n1 = dag.put(42, name="answer", doc="the answer to life, the universe, and everything") - >>> n1.value() - 42 - >>> n2 = dag.put({"a": 1, "b": [n1, "23"]}) - >>> n2.value() - {'a': 1, 'b': [42, '23']} - >>> dml.new("other-dag", "we'll import from here").commit(308) # create and commit another dag to import - >>> n3 = dag.load("other-dag") - >>> n3.value() - 308 - """ - if isinstance(value, Node) and value.dag != self: - return make_node(self, self.dml.put_load(value.dag.ref, value.ref, name=name, doc=doc)) - return make_node(self, self.dml.put_literal(value, name=name, doc=doc)) - - def call( - self, - fn: Union[Executable, "ExecutableNode"], - *args: Union["Node", Scalar, Collection], - name: Optional[str] = None, - doc: Optional[str] = None, - sleep: Optional[callable] = None, - timeout: int = -1, - **kw, - ) -> "Node": - """ - Call a function node with arguments. - - Parameters - ---------- - fn : Union[Executable, ExecutableNode] - Function to call - *args : Union[Node, Scalar, Collection] - Arguments to pass to the function - name : str, optional - Name for the result node - doc : str, optional - Documentation - sleep : callable, optional - A nullary function that returns sleep time in milliseconds - timeout : int, default=-1 - Maximum time to wait in milliseconds. If <= 0, wait indefinitely. - **kw : dict - Keyword arguments override any prepop values in the Executable (fn). 
- - Returns - ------- - Node - Result node - - Raises - ------ - TimeoutError - If the function call exceeds the timeout - Error - If the function returns an error - """ - if len(kw) > 0: - if isinstance(fn, Node): - fn = fn.value() - if set(kw) - set(fn.prepop): - extras = sorted(set(kw) - set(fn.prepop)) - msg = f"Function called with extraneous kwargs (not in `fn.prepop`): {extras}" - raise Error(msg, origin="dml", type="KeyError") - fn = Executable(uri=fn.uri, data=fn.data, adapter=fn.adapter, prepop={**fn.prepop, **kw}) - # FIXME: replace fails: `TypeError: Executable.__init__() missing 1 required positional argument: 'uri'` - # fn = replace(fn, prepop={**fn.prepop, **kw}) - sleep = sleep or BackoffWithJitter() - expr = [self.put(x) for x in [fn, *args]] - end = current_time_millis() + timeout - while timeout <= 0 or current_time_millis() < end: - resp = self.dml.start_fn(expr, name=name, doc=doc) - if resp: - return make_node(self, resp) - time.sleep(sleep() / 1000) - raise TimeoutError(f"invoking function: {expr[0].value()}") - - def commit(self, value) -> None: - """ - Commit a value to the DAG. - - Parameters - ---------- - value : Union[Node, Error, Any] - Value to commit - """ - value = value if isinstance(value, (Node, Error)) else self.put(value) - ref = cast(Ref, self.dml.commit(value)) - if self.message_handler: - self.message_handler(self.dml("ref", "dump", to_json(ref), as_text=True)) - self.ref = ref - - -@dataclass(frozen=True) -class Node: # noqa: F811 - """ - Representation of a node in a DaggerML DAG. 
- - Parameters - ---------- - dag : Dag - Parent DAG - ref : Ref - Node reference - """ - - dag: Dag - ref: Ref - _info: dict = field(default_factory=dict) - - def __repr__(self): - ref_id = self.ref if isinstance(self.ref, Error) else self.ref.to - return f"{self.__class__.__name__}({ref_id})" - - def __hash__(self): - return hash(self.ref) - - @property - def argv(self) -> "Node": - "Access the node's argv list" - return [make_node(self.dag, x) for x in self.dag.dml.get_argv(self)] - - def backtrack(self, *keys: Union[str, int]) -> "Node": - """ - If `key` is provided, it considers this node to be a collection created - by the appropriate method and loads the dag that corresponds to this key - - Parameters - ---------- - *keys : str, optional - Keys to backtrack through the node's structure - - Returns - ------- - Dag - The dag that this node was imported from (or in the case of a function call, this returns the fndag) - - Examples - -------- - >>> dml = Dml.temporary() - >>> dag = dml.new("test", "test") - >>> l0 = dag.put(42) - >>> c0 = dag.put({"a": 1, "b": [l0, "23"]}) - >>> assert c0.backtrack("b", 0) == l0 - >>> assert c0.backtrack("b").backtrack(0) == l0 - >>> assert c0["b"][0] != l0 # this is a different node, not the same as l0 - >>> dml.cleanup() - """ - data = self.dag.dml("node", "backtrack", self.ref.to, *map(str, keys)) - return make_node(self.dag, from_data(data)) - - def load(self) -> Dag: - """ - Convenience wrapper around `dml.load(node)` - - Returns - ------- - Dag - The dag that this node was imported from (or in the case of a function call, this returns the fndag) - """ - return self.dag.dml.load(self) - - @property - def type(self): - """Get the data type of the node.""" - return self._info["data_type"] - - @overload - def value(self: "ScalarNode") -> Scalar: ... - @overload - def value(self: "ListNode") -> list: ... - @overload - def value(self: "DictNode") -> dict: ... - @overload - def value(self: "ExecutableNode") -> Executable: ... 
- @overload - def value(self: "Node") -> Any: ... - def value(self): - """ - Get the concrete value of this node. - - Returns - ------- - Any - The actual value represented by this node - """ - return self.dag.dml.get_node_value(self.ref) - - -class ScalarNode(Node): - pass - - -class ExecutableNode(Node): - def __call__(self, *args, name=None, doc=None, sleep=None, timeout=-1, **kw) -> "Node": - """ - Call this node as a function. - - Parameters - ---------- - *args : Any - Arguments to pass to the function - name : str, optional - Name for the result node - doc : str, optional - Documentation - sleep : callable, optional - A nullary function that returns sleep time in milliseconds - timeout : int, default=-1 - Maximum time to wait in milliseconds. -1 means wait forever. - **kw : dict - Keyword arguments override any prepop values in the Executable (fn). - - Returns - ------- - Node - Result node - - Raises - ------ - TimeoutError - If the function call exceeds the timeout - Error - If the function returns an error - """ - return self.dag.call(self, *args, name=name, doc=doc, sleep=sleep, timeout=timeout, **kw) - - -class CollectionNode(Node): # noqa: F811 - """ - Representation of a collection node in a DaggerML DAG. - - Parameters - ---------- - dag : Dag - Parent DAG - ref : Ref - Node reference - """ - - @overload - def __getitem__(self, key: slice) -> "ListNode": ... - @overload - def __getitem__(self, key: Union[str, int, "Node"]) -> Any: ... - def __getitem__(self, key: Union[slice, str, int, "Node"]) -> Any: - """ - Get the `key` item. It should be the same as if you were working on the - actual value. - - Returns - ------- - Node - Node with the length of the collection - - Raises - ------ - Error - If the node isn't a collection (e.g. list, set, or dict). 
- - Examples - -------- - >>> dml = Dml.temporary() - >>> dag = dml.new("test", "test") - >>> node = dag.put({"a": 1, "b": [5, 6]}) - >>> nested = node["a"] - >>> isinstance(nested, Node) - True - >>> nested.value() - 1 - >>> node["b"][0].value() # lists too - 5 - """ - if isinstance(key, slice): - key = [key.start, key.stop, key.step] - return make_node(self.dag, self.dag.dml.get(self, key)) - - def contains(self, item, *, name=None, doc=None) -> "ScalarNode": - """ - For collection nodes, checks to see if `item` is in `self` - - Returns - ------- - Node - Node with the boolean of is `item` in `self` - """ - return make_node(self.dag, self.dag.dml.contains(self, item, name=name, doc=doc)) - - def __contains__(self, item): - return self.contains(item).value() # has to return boolean - - def __len__(self): # python requires this to be an int - """ - Get the node's length - - Returns - ------- - Node - Node with the length of the collection - - Raises - ------ - Error - If the node isn't a collection (e.g. list, set, or dict). - """ - return self._info["length"] - - -class ListNode(CollectionNode): # noqa: F811 - """ - Representation of a collection node in a DaggerML DAG. - - Parameters - ---------- - dag : Dag - Parent DAG - ref : Ref - Node reference - """ - - def __iter__(self): - """ - Iterate over the node's values (items if it's a list, and keys if it's a - dict) - - Returns - ------- - Node - Result node - - Raises - ------ - Error - If the node isn't a collection (e.g. list, set, or dict). 
- """ - for i in range(len(self)): - yield self[i] - - def conj(self, item, *, name=None, doc=None) -> "ListNode": - """ - For a list or set node, append an item - - Returns - ------- - Node - Node containing the new collection - - Notes - ----- - `append` is an alias `conj` - """ - return make_node(self.dag, self.dag.dml.conj(self, item, name=name, doc=doc)) - - def append(self, item, *, name=None, doc=None) -> "ListNode": - """ - For a list or set node, append an item - - Returns - ------- - Node - Node containing the new collection - - See Also - -------- - conj : The main implementation - """ - return self.conj(item, name=name, doc=doc) - - -class DictNode(CollectionNode): # noqa: F811 - def keys(self) -> list[str]: - """ - Get the keys of a dictionary node. - - Parameters - ---------- - name : str, optional - Name for the result node - doc : str, optional - Documentation - - Returns - ------- - list[str] - List of keys in the dictionary node - """ - return self._info["keys"].copy() - - def __iter__(self): - """ - Iterate over the node's values (items if it's a list, and keys if it's a - dict) - - Returns - ------- - Node - Result node - - Raises - ------ - Error - If the node isn't a collection (e.g. list, set, or dict). - """ - for k in self.keys(): - yield k - - def get(self, key, default=None, *, name=None, doc=None) -> "Node": - """ - For a dict node, return the value for key if key exists, else default. - - If default is not given, it defaults to None, so that this method never raises a KeyError. - """ - return make_node(self.dag, self.dag.dml.get(self, key, default, name=name, doc=doc)) - - def items(self) -> Iterator[tuple[str, "Node"]]: - """ - Iterate over key-value pairs of a dictionary node. 
- - Returns - ------- - Iterator[tuple[Node, Node]] - Iterator over (key, value) pairs - """ - if self.type != "dict": - raise Error(f"Cannot iterate items of type: {self.type}", origin="dml", type="TypeError") - for k in self: - yield k, self[k] - - def values(self) -> list["Node"]: - """ - Get the values of a dictionary node. - - Parameters - ---------- - name : str, optional - Name for the result node - doc : str, optional - Documentation - - Returns - ------- - list[Node] - List of values in the dictionary node - """ - return [self[k] for k in self] - - def assoc(self, key, value, *, name=None, doc=None) -> "DictNode": - """ - For a dict node, associate a new value into the map - - Returns - ------- - Node - Node containing the new dict - """ - return make_node(self.dag, self.dag.dml.assoc(self, key, value, name=name, doc=doc)) - - def update(self, update) -> "DictNode": - """ - For a dict node, update like python dicts - - Returns - ------- - Node - Node containing the new collection - - Notes - ----- - calls `assoc` iteratively for k, v pairs in update. 
- - See Also - -------- - assoc : The main implementation - """ - for k, v in update.items(): - self = self.assoc(k, v) - return self diff --git a/src/daggerml/util.py b/src/daggerml/util.py index d4696e4..c093cc4 100644 --- a/src/daggerml/util.py +++ b/src/daggerml/util.py @@ -1,7 +1,17 @@ +import json +import logging +import os import time +import urllib.request from dataclasses import dataclass from random import randint +import boto3 +from botocore.client import Config +from botocore.exceptions import BotoCoreError, NoRegionError + +logger = logging.getLogger(__name__) + def snake2kebab(x: str) -> str: return x.replace("_", "-") @@ -11,22 +21,6 @@ def flatten(nested: list[list]) -> list: return [x for xs in nested for x in xs] -def kwargs2opts(*args, **kwargs) -> list[str]: - x = {f"--{snake2kebab(k)}": v for k, v in kwargs.items()} - return flatten([[k] if v is True else [k, v] for k, v in x.items()]) - - -def raise_ex(x): - if isinstance(x, Exception): - raise x - return x - - -def assocattr(x, k, v): - setattr(x, k, v) - return x - - def current_time_millis(): return round(time.time() * 1000) @@ -80,3 +74,93 @@ class BackoffWithJitter: def __call__(self): self.state = min(self.max, randint(self.min, max(self.min, self.state) * self.k)) return self.state + + +def _get_region_from_metadata(): + """ + Attempts to retrieve the AWS region from ECS or EC2 metadata. + + Returns + ------- + Optional[str] + The AWS region string if found, otherwise None. 
+ """ + # ECS (used in AWS Batch) + metadata_uri = os.environ.get("ECS_CONTAINER_METADATA_URI_V4") or os.environ.get("ECS_CONTAINER_METADATA_URI") + if metadata_uri: + try: + with urllib.request.urlopen(metadata_uri, timeout=2) as response: + metadata = json.load(response) + cluster_label = metadata.get("Labels", {}).get("com.amazonaws.ecs.cluster", "") + region = cluster_label.split(":")[0] if ":" in cluster_label else None + if region: + return region + except Exception as e: + logger.warning("Failed to get region from ECS metadata: %s", e) + # EC2 fallback: use IMDSv2 + try: + token_req = urllib.request.Request( + "http://169.254.169.254/latest/api/token", + method="PUT", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "60"}, + ) + with urllib.request.urlopen(token_req, timeout=2) as token_response: + token = token_response.read().decode() + region_req = urllib.request.Request( + "http://169.254.169.254/latest/dynamic/instance-identity/document", + headers={"X-aws-ec2-metadata-token": token}, + ) + with urllib.request.urlopen(region_req, timeout=2) as region_response: + identity_doc = json.load(region_response) + return identity_doc.get("region") + except Exception as e: + logger.warning("Failed to get region from EC2 metadata: %s", e) + return + + +def get_client(name, region=None, default_region="us-east-1"): + """ + Creates a robust boto3 client, determining the AWS region in the following order: + 1. Explicit argument + 2. AWS_REGION / AWS_DEFAULT_REGION environment variables + 3. boto3/botocore session + 4. ECS/EC2 metadata + 5. Fallback default region (us-east-1) + + Parameters + ---------- + name : str + The name of the AWS service client. + region : Optional[str], default=None + The AWS region to use. + default_region : str, default="us-east-1" + The fallback AWS region. + + Returns + ------- + boto3.client + A boto3 client for the specified service. 
+ """ + # Step 1–3: Try common boto3 config methods + region = region or os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") + if not region: + try: + region = boto3.Session().region_name + except (BotoCoreError, NoRegionError, ConnectionRefusedError): + logger.debug("could not instantiate boto client...") + pass + # Step 4: Metadata if still no region + if not region: + logger.debug("inferring aws region from metadata") + region = _get_region_from_metadata() + # Step 5: Fallback default + if not region: + logger.warning(f"falling back to default region '{default_region}'") + region = default_region + config = Config( + region_name=region, + connect_timeout=5, + retries={"max_attempts": 5, "mode": "adaptive"}, + max_pool_connections=20, + ) + return boto3.client(name, config=config) diff --git a/submodules/daggerml_cli b/submodules/daggerml_cli deleted file mode 160000 index dfeb3be..0000000 --- a/submodules/daggerml_cli +++ /dev/null @@ -1 +0,0 @@ -Subproject commit dfeb3be3d2188d65f2b25b4a87a605a75ee5d3ae diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..7cc9f9a --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,12 @@ +import os + +from daggerml import temporary as _temporary + + +def temporary_dml(*, repo: str | None = None, remote_root: str | None = None, **kw): + if repo is not None and "remote_project" not in kw: + kw["remote_project"] = f"dml://test/{repo}" + return _temporary( + remote_root=remote_root or os.environ["DML_REMOTE_ROOT"], + **kw, + ) diff --git a/tests/assets/fns/async.py b/tests/assets/fns/async.py index 9b5de33..b327f60 100644 --- a/tests/assets/fns/async.py +++ b/tests/assets/fns/async.py @@ -1,21 +1,59 @@ +import hashlib import json import os import sys +import tempfile +from pathlib import Path -from daggerml import Dml +from daggerml._internal._db import DmlDbEnv +from daggerml._internal.execution_context import execution_context +from daggerml._internal.ops.index import IndexOps +from 
daggerml._internal.ops.node import NodeOps +from daggerml._internal.types import NAMESPACES, Error if __name__ == "__main__": - stdin = json.loads(sys.stdin.read()) - with Dml.temporary(cache_path=stdin["cache_path"]) as dml: - cache_dir = os.getenv("DML_FN_CACHE_DIR", "") - cache_file = os.path.join(cache_dir, stdin["cache_key"]) - debug_file = os.path.join(cache_dir, "debug") + envelope = json.loads(sys.stdin.read()) + remote = envelope["remote"] + argv_ptr = envelope["argv_ptr"] + execution_id = envelope["execution_id"] + execution_cache_key = envelope["cache_key"] + remote_root = remote["root"] + cache_key = hashlib.sha256(argv_ptr.encode()).hexdigest() + cache_dir = os.getenv("DML_TEST_FN_STATE_DIR", "") + cache_file = os.path.join(cache_dir, cache_key) + debug_file = os.path.join(cache_dir, "debug") - with open(debug_file, "a") as f: - f.write("ASYNC EXECUTING\n") + with open(debug_file, "a", encoding="utf-8") as fh: + fh.write("ASYNC EXECUTING\n") - if os.path.isfile(cache_file): - with dml.new("test", "test", stdin["dump"], print) as d0: - d0.commit(sum(d0.argv[1:].value())) - else: - open(cache_file, "w").close() + if not os.path.isfile(cache_file): + open(cache_file, "w", encoding="utf-8").close() + print( + json.dumps( + {"status": "running", "error": None, "state": {"cache_file": cache_file}}, + separators=(",", ":"), + ) + ) + raise SystemExit(0) + + with execution_context(execution_id, execution_cache_key): + with tempfile.TemporaryDirectory(prefix="dml-fn-") as tmpdir: + db_path = Path(tmpdir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db = DmlDbEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + try: + ops = IndexOps(db, remote_root=remote_root) + index_ref = ops.create(argv_ptr=argv_ptr) + node_ops = NodeOps(db) + argv = node_ops.unroll(ops.get_argv(index_ref)) + try: + result = ops.put_literal(index_ref, sum(argv[1:])) + except Exception as e: + result = Error.from_ex(e) + commit_ref = ops.commit(index_ref, result, 
message="async") + with ops._tx(readonly=True) as txn: + commit_obj = txn.get(commit_ref) + dag_id = commit_obj.dag.id() + print(json.dumps({"status": "succeeded", "error": None, "dag_id": dag_id}, separators=(",", ":"))) + finally: + db.close() diff --git a/tests/assets/fns/sum.py b/tests/assets/fns/sum.py index 688804e..eb52033 100644 --- a/tests/assets/fns/sum.py +++ b/tests/assets/fns/sum.py @@ -1,14 +1,42 @@ import json import sys +import tempfile +from pathlib import Path from uuid import uuid4 -from daggerml import Dml +from daggerml._internal._db import DmlDbEnv +from daggerml._internal.execution_context import execution_context +from daggerml._internal.ops.index import IndexOps +from daggerml._internal.ops.node import NodeOps +from daggerml._internal.types import NAMESPACES if __name__ == "__main__": - stdin = json.loads(sys.stdin.read()) - with Dml.temporary(cache_path=stdin["cache_path"]) as dml: - with dml.new("test", "test", stdin["dump"], print) as dag: - dag.put(len(dag.argv[1:]), name="num_args") - dag.put(sum(dag.argv[1:].value()), name="n0") - dag.put(str(uuid4()), name="uuid") - dag.commit(dag.n0) + envelope = json.loads(sys.stdin.read()) + remote = envelope["remote"] + argv_ptr = envelope["argv_ptr"] + execution_id = envelope["execution_id"] + cache_key = envelope["cache_key"] + remote_root = remote["root"] + runnable_kwargs = envelope.get("runnable", {}).get("kwargs", {}) + with execution_context(execution_id, cache_key): + with tempfile.TemporaryDirectory(prefix="dml-fn-") as tmpdir: + db_path = Path(tmpdir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db = DmlDbEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + try: + ops = IndexOps(db, remote_root=remote_root) + index_ref = ops.create(argv_ptr=argv_ptr) + node_ops = NodeOps(db) + argv = node_ops.unroll(ops.get_argv(index_ref)) + for key, value in runnable_kwargs.items(): + ops.put_literal(index_ref, value, name=key) + ops.put_literal(index_ref, len(argv[1:]), 
name="num_args") + result = ops.put_literal(index_ref, sum(argv[1:]), name="n0") + ops.put_literal(index_ref, str(uuid4()), name="uuid") + commit_ref = ops.commit(index_ref, result, message="sum") + with ops._tx(readonly=True) as txn: + commit_obj = txn.get(commit_ref) + dag_id = commit_obj.dag.id() + print(json.dumps({"status": "succeeded", "error": None, "dag_id": dag_id}, separators=(",", ":"))) + finally: + db.close() diff --git a/tests/assets/fns/timeout.py b/tests/assets/fns/timeout.py index 56c09fe..e1d4fcf 100644 --- a/tests/assets/fns/timeout.py +++ b/tests/assets/fns/timeout.py @@ -1,2 +1,4 @@ +import json + if __name__ == "__main__": - exit() + print(json.dumps({"status": "running", "error": None, "state": {}}, separators=(",", ":"))) diff --git a/tests/assets/internal_fn/adapter-error.py b/tests/assets/internal_fn/adapter-error.py new file mode 100644 index 0000000..a20c059 --- /dev/null +++ b/tests/assets/internal_fn/adapter-error.py @@ -0,0 +1,47 @@ +import json +import sys +import tempfile +from pathlib import Path + +from daggerml._internal._db import DmlDbEnv +from daggerml._internal.execution_context import execution_context +from daggerml._internal.ops.base_ops import BaseOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.index import IndexOps +from daggerml._internal.types import DEFAULT_HEAD, NAMESPACES, Commit, Error, Tree + + +def _init_repo(db: DmlDbEnv) -> None: + with BaseOps(db)._tx(readonly=False) as txn: + tree_ref = txn.put(Tree(dags={})) + commit_ref = txn.put(Commit(parents=[], tree=tree_ref, author="test", message="initial")) + HeadOps(_db=db).create_branch(DEFAULT_HEAD, commit_ref) + + +if __name__ == "__main__": + envelope = json.loads(sys.stdin.read()) + remote = envelope["remote"] + argv_ptr = envelope["argv_ptr"] + execution_id = envelope["execution_id"] + cache_key = envelope["cache_key"] + remote_root = remote["root"] + with execution_context(execution_id, cache_key): + with 
tempfile.TemporaryDirectory(prefix="dml-fn-") as tmpdir: + db_path = Path(tmpdir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db = DmlDbEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + try: + _init_repo(db) + ops = IndexOps(db, remote_root=remote_root) + index_ref = ops.create(argv_ptr=argv_ptr) + try: + raise ValueError("test error") + except Exception as e: + result = Error.from_ex(e) + commit_ref = ops.commit(index_ref, result, message="adapter_error function result") + with ops._tx(readonly=True) as txn: + commit_obj = txn.get(commit_ref) + dag_id = commit_obj.dag.id() + print(json.dumps({"status": "succeeded", "error": None, "dag_id": dag_id}, separators=(",", ":"))) + finally: + db.close() diff --git a/tests/assets/internal_fn/delayed-sum.py b/tests/assets/internal_fn/delayed-sum.py new file mode 100644 index 0000000..7530e54 --- /dev/null +++ b/tests/assets/internal_fn/delayed-sum.py @@ -0,0 +1,74 @@ +import json +import os +import sys +import tempfile +from pathlib import Path +from typing import cast + +from daggerml._internal._db import DmlDbEnv +from daggerml._internal.execution_context import execution_context +from daggerml._internal.ops.base_ops import BaseOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.index import IndexOps +from daggerml._internal.ops.node import NodeOps +from daggerml._internal.types import DEFAULT_HEAD, NAMESPACES, Commit, Error, Tree + + +def _init_repo(db: DmlDbEnv) -> None: + with BaseOps(db)._tx(readonly=False) as txn: + tree_ref = txn.put(Tree(dags={})) + commit_ref = txn.put(Commit(parents=[], tree=tree_ref, author="test", message="initial")) + HeadOps(_db=db).create_branch(DEFAULT_HEAD, commit_ref) + + +if __name__ == "__main__": + envelope = json.loads(sys.stdin.read()) + remote = envelope["remote"] + argv_ptr = envelope["argv_ptr"] + execution_id = envelope["execution_id"] + cache_key = envelope["cache_key"] + remote_root = remote["root"] + tmp_dir = 
os.environ.get("DML_TMP_DIR") + if not tmp_dir: + raise ValueError("DML_TMP_DIR environment variable not set") + completion_file = Path(tmp_dir) / "completion.flag" + if not completion_file.exists(): + completion_file.touch() + print( + json.dumps( + {"status": "running", "error": None, "state": {"completion_file": str(completion_file)}}, + separators=(",", ":"), + ) + ) + raise SystemExit(0) + + with execution_context(execution_id, cache_key): + with tempfile.TemporaryDirectory(prefix="dml-fn-") as tmprepo: + db_path = Path(tmprepo) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db = DmlDbEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + try: + _init_repo(db) + ops = IndexOps(db, remote_root=remote_root) + index_ref = ops.create(argv_ptr=argv_ptr) + node_ops = NodeOps(db) + + argv = cast(list, node_ops.unroll(ops.get_argv(index_ref))) + _, *args = argv + + try: + for i, arg in enumerate(args): + if not isinstance(arg, (int, float)): + raise TypeError(f"Argument at index {i} is {type(arg).__name__}, expected int or float") + result = ops.put_literal(index_ref, float(sum(cast(list[float], args)))) + except Exception as e: + result = Error.from_ex(e) + + commit_ref = ops.commit(index_ref, result, message="delayed sum function result") + with ops._tx(readonly=True) as txn: + commit_obj = txn.get(commit_ref) + dag_id = commit_obj.dag.id() + print(json.dumps({"status": "succeeded", "error": None, "dag_id": dag_id}, separators=(",", ":"))) + finally: + db.close() + completion_file.unlink() diff --git a/tests/assets/internal_fn/prepop.py b/tests/assets/internal_fn/prepop.py new file mode 100644 index 0000000..057a6c9 --- /dev/null +++ b/tests/assets/internal_fn/prepop.py @@ -0,0 +1,37 @@ +import json +import sys +import tempfile +from pathlib import Path + +from daggerml._internal._db import DmlDbEnv +from daggerml._internal.execution_context import execution_context +from daggerml._internal.ops.index import IndexOps +from 
daggerml._internal.ops.node import NodeOps +from daggerml._internal.types import NAMESPACES + +if __name__ == "__main__": + envelope = json.loads(sys.stdin.read()) + remote = envelope["remote"] + argv_ptr = envelope["argv_ptr"] + execution_id = envelope["execution_id"] + cache_key = envelope["cache_key"] + remote_root = remote["root"] + with execution_context(execution_id, cache_key): + with tempfile.TemporaryDirectory(prefix="dml-fn-") as tmpdir: + db_path = Path(tmpdir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db = DmlDbEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + try: + ops = IndexOps(db, remote_root=remote_root) + index_ref = ops.create(argv_ptr=argv_ptr) + node_ops = NodeOps(db) + argv: list[float] = node_ops.unroll(ops.get_argv(index_ref)) + kwargv: dict = node_ops.unroll(ops.get_kwargv(index_ref)) + result = ops.put_literal(index_ref, float(sum(argv[1:]) * kwargv["x"])) + commit_ref = ops.commit(index_ref, result, message="prepop function result") + with ops._tx(readonly=True) as txn: + commit_obj = txn.get(commit_ref) + dag_id = commit_obj.dag.id() + print(json.dumps({"status": "succeeded", "error": None, "dag_id": dag_id}, separators=(",", ":"))) + finally: + db.close() diff --git a/tests/assets/internal_fn/python-fork-adapter.py b/tests/assets/internal_fn/python-fork-adapter.py new file mode 100755 index 0000000..4e85e26 --- /dev/null +++ b/tests/assets/internal_fn/python-fork-adapter.py @@ -0,0 +1,51 @@ +import json +import shutil +import subprocess +import sys +from urllib.parse import urlparse + + +def _run_adapter(adapter_name: str, payload: dict) -> int: + adapter_path = shutil.which(adapter_name) if "/" not in adapter_name else adapter_name + if adapter_path is None: + sys.stderr.write(f"No such adapter: {adapter_name}\n") + return 1 + cmd = [adapter_path] + if adapter_path.endswith(".py"): + cmd = [sys.executable, adapter_path] + completed = subprocess.run(cmd, input=json.dumps(payload), text=True, 
capture_output=True, check=False) + sys.stdout.write(completed.stdout) + sys.stderr.write(completed.stderr) + return completed.returncode + + +def _run_target(target: str, raw: str) -> int: + script = urlparse(target).path + completed = subprocess.run([sys.executable, script], input=raw, text=True, capture_output=True, check=False) + sys.stdout.write(completed.stdout) + sys.stderr.write(completed.stderr) + return completed.returncode + + +def main() -> None: + raw = sys.stdin.read() + payload = json.loads(raw) + runnable = payload.get("runnable", {}) + sub = runnable.get("sub") + if sub is not None: + forwarded = { + "argv_ptr": payload.get("argv_ptr"), + "cache_key": payload.get("cache_key"), + "execution_id": payload.get("execution_id"), + "remote": payload.get("remote"), + "state": None, + "runnable": sub, + } + code = _run_adapter(sub.get("adapter", ""), forwarded) + raise SystemExit(code) + code = _run_target(runnable.get("target", ""), raw) + raise SystemExit(code) + + +if __name__ == "__main__": + main() diff --git a/tests/assets/internal_fn/rand.py b/tests/assets/internal_fn/rand.py new file mode 100644 index 0000000..3981188 --- /dev/null +++ b/tests/assets/internal_fn/rand.py @@ -0,0 +1,45 @@ +import json +import sys +import tempfile +from pathlib import Path +from uuid import uuid4 + +from daggerml._internal._db import DmlDbEnv +from daggerml._internal.execution_context import execution_context +from daggerml._internal.ops.base_ops import BaseOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.index import IndexOps +from daggerml._internal.types import DEFAULT_HEAD, NAMESPACES, Commit, Tree + + +def _init_repo(db: DmlDbEnv) -> None: + with BaseOps(db)._tx(readonly=False) as txn: + tree_ref = txn.put(Tree(dags={})) + commit_ref = txn.put(Commit(parents=[], tree=tree_ref, author="test", message="initial")) + HeadOps(_db=db).create_branch(DEFAULT_HEAD, commit_ref) + + +if __name__ == "__main__": + envelope = 
json.loads(sys.stdin.read()) + remote = envelope["remote"] + argv_ptr = envelope["argv_ptr"] + execution_id = envelope["execution_id"] + cache_key = envelope["cache_key"] + remote_root = remote["root"] + with execution_context(execution_id, cache_key): + with tempfile.TemporaryDirectory(prefix="dml-fn-") as tmpdir: + db_path = Path(tmpdir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db = DmlDbEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + try: + _init_repo(db) + ops = IndexOps(db, remote_root=remote_root) + index_ref = ops.create(argv_ptr=argv_ptr) + result = ops.put_literal(index_ref, str(uuid4())) + commit_ref = ops.commit(index_ref, result, message="rand function result") + with ops._tx(readonly=True) as txn: + commit_obj = txn.get(commit_ref) + dag_id = commit_obj.dag.id() + print(json.dumps({"status": "succeeded", "error": None, "dag_id": dag_id}, separators=(",", ":"))) + finally: + db.close() diff --git a/tests/assets/internal_fn/sum.py b/tests/assets/internal_fn/sum.py new file mode 100644 index 0000000..cf207af --- /dev/null +++ b/tests/assets/internal_fn/sum.py @@ -0,0 +1,44 @@ +import json +import sys +import tempfile +from pathlib import Path +from typing import cast + +from daggerml._internal._db import DmlDbEnv +from daggerml._internal.execution_context import execution_context +from daggerml._internal.ops.index import IndexOps +from daggerml._internal.ops.node import NodeOps +from daggerml._internal.types import NAMESPACES, Error + +if __name__ == "__main__": + envelope = json.loads(sys.stdin.read()) + remote = envelope["remote"] + argv_ptr = envelope["argv_ptr"] + execution_id = envelope["execution_id"] + cache_key = envelope["cache_key"] + remote_root = remote["root"] + with execution_context(execution_id, cache_key): + with tempfile.TemporaryDirectory(prefix="dml-fn-") as tmpdir: + db_path = Path(tmpdir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db = DmlDbEnv.create(str(db_path), 
namespaces=sorted(NAMESPACES)) + try: + ops = IndexOps(db, remote_root=remote_root) + index_ref = ops.create(argv_ptr=argv_ptr) + node_ops = NodeOps(db) + argv = cast(list, node_ops.unroll(ops.get_argv(index_ref))) + _, *args = argv + try: + for i, arg in enumerate(args): + if not isinstance(arg, (int, float)): + raise TypeError(f"Argument at index {i} is {type(arg).__name__}, expected int or float") + result = ops.put_literal(index_ref, float(sum(args))) + except Exception as e: + result = Error.from_ex(e) + commit_ref = ops.commit(index_ref, result, message="sum function result") + with ops._tx(readonly=True) as txn: + commit_obj = txn.get(commit_ref) + dag_id = commit_obj.dag.id() + print(json.dumps({"status": "succeeded", "error": None, "dag_id": dag_id}, separators=(",", ":"))) + finally: + db.close() diff --git a/tests/conftest.py b/tests/conftest.py index 255f745..19bcfc2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,20 +6,62 @@ import pytest -from daggerml import Dml +from tests import temporary_dml + + +@pytest.fixture(scope="session") +def _aws_server(): + with patch.dict(os.environ): + for key in list(os.environ.keys()): + if key.startswith("AWS_"): + del os.environ[key] + from moto.server import ThreadedMotoServer + + server = ThreadedMotoServer(port=0, verbose=False) + server.start() + host, port = server.get_host_and_port() + try: + yield { + "endpoint": f"http://{host}:{port}", + "envvars": { + "AWS_ACCESS_KEY_ID": "test", + "AWS_SECRET_ACCESS_KEY": "test", + "AWS_REGION": "us-east-1", + "AWS_DEFAULT_REGION": "us-east-1", + "AWS_ENDPOINT_URL": f"http://{host}:{port}", + }, + } + finally: + server.stop() @pytest.fixture(autouse=True) def clear_envvars(): with patch.dict(os.environ): # Clear AWS environment variables before any tests run - for k in os.environ: + for k in list(os.environ.keys()): if k.startswith("AWS_") or k.startswith("DML_"): del os.environ[k] os.environ["AWS_SHARED_CREDENTIALS_FILE"] = "/dev/null" yield 
+@pytest.fixture(autouse=True) +def remote_env(clear_envvars, _aws_server): + import boto3 + + os.environ.update(_aws_server["envvars"]) + os.environ["DML_REMOTE_ROOT"] = "s3://test-bucket/test-prefix" + boto3.setup_default_session() + endpoint = _aws_server["endpoint"] + s3 = boto3.client("s3", endpoint_url=endpoint) + try: + s3.create_bucket(Bucket="test-bucket") + except Exception: + pass + yield + + @pytest.fixture(autouse=True) def debug(clear_envvars): """Fixture to set debug mode for tests.""" @@ -29,15 +71,16 @@ def debug(clear_envvars): @pytest.fixture -def dml(tmpdir): - with Dml.temporary(cache_path=str(tmpdir)) as _dml: - with patch.dict(os.environ, DML_FN_CACHE_DIR=_dml.kwargs["config_dir"], **_dml.envvars): +def dml(): + with temporary_dml() as _dml: + # Set function cache dir to repo so tests can find debug files + with patch.dict(os.environ, DML_TEST_FN_STATE_DIR=_dml._context.project_home): yield _dml @pytest.fixture def fake_dml(): # patches Dml and Dag so that neither does anything - with patch("daggerml.core.Dml", autospec=True) as mock_dml: - with patch("daggerml.core.Dag", autospec=True) as mock_dag: + with patch("daggerml.api.Dml", autospec=True) as mock_dml: + with patch("daggerml.api.Dag", autospec=True) as mock_dag: yield mock_dml, mock_dag diff --git a/tests/contracts/contrib/executor/test_executor_base_handle.py b/tests/contracts/contrib/executor/test_executor_base_handle.py new file mode 100644 index 0000000..3e4c482 --- /dev/null +++ b/tests/contracts/contrib/executor/test_executor_base_handle.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +import pytest + +from daggerml._internal.types import Runnable, Uri +from daggerml.contrib.executors._base import ExecutorBase + + +def _runnable() -> Runnable: + return Runnable(target=Uri("test"), kwargs={}, adapter="test-adapter") + + +def _remote() -> dict[str, str]: + return {"root": "s3://test-bucket/test-prefix"} + + +class MockExecutor(ExecutorBase): + name = "mock" + adapter = 
"local" + calls: list[str] = [] + + def start(self, *, cache_key, execution_id, runnable, argv_ptr, remote): + MockExecutor.calls.append("start") + return {"status": "running", "error": None, "state": {"token": execution_id}} + + def poll(self, *, cache_key, execution_id, state, remote): + MockExecutor.calls.append("poll") + return {"status": "running", "error": None, "state": state} + + +class TerminalStartExecutor(MockExecutor): + def start(self, *, cache_key, execution_id, runnable, argv_ptr, remote): + MockExecutor.calls.append("start") + return {"status": "succeeded", "error": None, "dag_id": "a" * 64} + + +class CancelExecutor(MockExecutor): + def cancel(self, *, cache_key, execution_id, state, remote): + MockExecutor.calls.append("cancel") + return {"status": "cancel-detached", "error": None} + + +@pytest.fixture(autouse=True) +def reset_calls(): + MockExecutor.calls = [] + yield + + +@pytest.mark.parametrize( + "contract_id,state,expected_start_calls,expected_poll_calls,stage", + [ + pytest.param("EXB-HDL-001", None, 1, 0, "kickoff", id="EXB-HDL-001:kickoff-uses-start"), + pytest.param("EXB-HDL-002", {"some": "state"}, 0, 1, "resume", id="EXB-HDL-002:resume-uses-poll"), + ], +) +def test_executor_base_handle_lifecycle_stage_matrix_EXB_HDL_001_EXB_HDL_002( + contract_id, state, expected_start_calls, expected_poll_calls, stage +): + del contract_id, stage + result = MockExecutor.handle( + cache_key="ck-stage", + execution_id="exec-stage", + state=state, + execution_status=None, + cancel_requested_by=None, + runnable=_runnable(), + argv_ptr="ptr", + remote=_remote(), + ) + assert MockExecutor.calls.count("start") == expected_start_calls + assert MockExecutor.calls.count("poll") == expected_poll_calls + assert result["status"] == "running" + + +def test_executor_base_handle_EXB_HDL_003_returns_terminal_start_result_directly(): + result = TerminalStartExecutor.handle( + cache_key="ck-terminal", + execution_id="exec-terminal", + state=None, + 
execution_status=None, + cancel_requested_by=None, + runnable=_runnable(), + argv_ptr="ptr", + remote=_remote(), + ) + assert MockExecutor.calls.count("start") == 1 + assert result["status"] == "succeeded" + assert result.get("dag_id") == "a" * 64 + + +def test_executor_base_handle_EXB_HDL_004_routes_mixed_state_invocations_correctly(): + class TrackingExecutor(ExecutorBase): + name = "tracking" + adapter = "local" + + def start(self, *, cache_key, execution_id, runnable, argv_ptr, remote): + MockExecutor.calls.append("start") + return {"status": "running", "error": None, "state": {"token": execution_id}} + + def poll(self, *, cache_key, execution_id, state, remote): + MockExecutor.calls.append("poll") + return {"status": "running", "error": None, "state": state} + + kickoff = TrackingExecutor.handle( + cache_key="ck-mixed", + execution_id="exec-mixed", + state=None, + execution_status=None, + cancel_requested_by=None, + runnable=_runnable(), + argv_ptr="ptr", + remote=_remote(), + ) + resumed = TrackingExecutor.handle( + cache_key="ck-mixed", + execution_id="exec-mixed", + state={"existing": "state"}, + execution_status=None, + cancel_requested_by=None, + runnable=_runnable(), + argv_ptr="ptr", + remote=_remote(), + ) + assert MockExecutor.calls.count("start") == 1 + assert MockExecutor.calls.count("poll") == 1 + assert kickoff["status"] == "running" + assert resumed["status"] == "running" + + +def test_executor_base_handle_routes_cancel_requested_updates_to_cancel(): + result = CancelExecutor.handle( + cache_key="ck-cancel", + execution_id="exec-cancel", + state={"pid": 1}, + execution_status="cancel-pending", + cancel_requested_by="alice@example.com", + runnable=_runnable(), + argv_ptr="ptr", + remote=_remote(), + ) + assert MockExecutor.calls == ["cancel"] + assert result == {"status": "cancel-detached", "error": None} diff --git a/tests/contracts/contrib/executor/test_ssh_handle.py b/tests/contracts/contrib/executor/test_ssh_handle.py new file mode 100644 
index 0000000..cba65ce --- /dev/null +++ b/tests/contracts/contrib/executor/test_ssh_handle.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import json +import os +import subprocess +from typing import Any, cast + +import pytest + +from daggerml._internal.types import Runnable, Uri +from daggerml.contrib import adapter_registry as areg +from daggerml.contrib import executor_registry as ereg +from daggerml.contrib.adapters import LocalAdapter +from daggerml.contrib.executors import ScriptExecutor, SshExecutor + + +@pytest.fixture(autouse=True) +def _reset_registries(tmp_path, monkeypatch): + areg._reset_for_tests() + ereg._reset_for_tests() + monkeypatch.setenv("DML_TEST_FN_STATE_DIR", str(tmp_path / "state")) + monkeypatch.setenv("DML_REMOTE_ROOT", "s3://test-bucket/test-prefix") + areg.register_adapter(LocalAdapter) + ereg.register_executor(ScriptExecutor) + ereg.register_executor(SshExecutor) + yield + areg._reset_for_tests() + ereg._reset_for_tests() + + +def _remote() -> dict[str, str]: + return {"root": os.environ["DML_REMOTE_ROOT"]} + + +def _sub_runnable() -> Runnable: + return Runnable(target=Uri("script"), adapter="dml-local-adapter", kwargs={"x": 1}, sub=None) + + +def _ssh_runnable() -> Runnable: + return Runnable( + target=Uri("ssh"), + adapter="dml-local-adapter", + kwargs={"host": "worker.example", "flags": ["-p", "2222"], "env_files": ["/etc/dml.env"]}, + sub=_sub_runnable(), + ) + + +@pytest.mark.parametrize( + "contract_id,transport_stdout,transport_returncode,transport_stderr,stage,expected_status,expected_error", + [ + pytest.param( + "SSH-HDL-001", + json.dumps({"status": "succeeded", "error": None, "dag_id": "d" * 64}).encode(), + 0, + b"", + "kickoff", + "succeeded", + None, + id="SSH-HDL-001:kickoff-forwards-envelope-and-projects-success", + ), + pytest.param( + "SSH-HDL-002", + b"", + 255, + b"permission denied", + "terminal-failed", + "failed", + "SSH command failed (255): permission denied", + 
id="SSH-HDL-002:transport-nonzero-projects-failed", + ), + pytest.param( + "SSH-HDL-003", + b'{"status":"running","error":null,"state":{}}', + 0, + b"", + "resume", + "running", + None, + id="SSH-HDL-003:running-child-result-passes-through", + ), + pytest.param( + "SSH-HDL-004", + b'{"status":"failed","error":"child boom"}', + 0, + b"", + "terminal-failed", + "failed", + "child boom", + id="SSH-HDL-004:child-failure-projects-unchanged", + ), + ], +) +def test_ssh_executor_handle_stage_matrix_SSH_HDL_001_to_SSH_HDL_004( + monkeypatch, + contract_id, + transport_stdout, + transport_returncode, + transport_stderr, + stage, + expected_status, + expected_error, +): + del contract_id, stage + runnable = _ssh_runnable() + seen: dict[str, Any] = {} + + def _fake_run(cmd, input=None, capture_output=None, check=None): + seen["cmd"] = cmd + seen["payload"] = json.loads(cast(bytes, input).decode("utf-8")) + return subprocess.CompletedProcess( + args=cmd, + returncode=transport_returncode, + stdout=transport_stdout, + stderr=transport_stderr, + ) + + monkeypatch.setattr(subprocess, "run", _fake_run) + + result = SshExecutor.handle( + runnable=runnable, + argv_ptr="s3://bucket/argv", + cache_key="ck-ssh-handle", + execution_id="exec-ssh-handle", + remote=_remote(), + state=None, + ) + + assert result["status"] == expected_status + if expected_error is None: + assert result.get("error") is None + else: + assert expected_error in result["error"] + + assert seen["cmd"][:4] == ["ssh", "-p", "2222", "worker.example"] + assert seen["cmd"][4].startswith("set -e; . /etc/dml.env; exec dml-local-adapter") + assert ". 
/etc/dml.env" in seen["cmd"][4] + assert "DML_REMOTE_ROOT" not in seen["cmd"][4] + assert "--poll" in seen["cmd"][4] + assert seen["payload"]["runnable"]["target"] == "script" + assert seen["payload"]["cache_key"] == "ck-ssh-handle" + assert seen["payload"]["argv_ptr"] == "s3://bucket/argv" + assert seen["payload"]["execution_id"] == "exec-ssh-handle" + assert seen["payload"]["state"] is None + + +def test_ssh_executor_handle_SSH_HDL_005_forwards_runtime_state_to_transport(monkeypatch): + runnable = Runnable( + target=Uri("ssh"), + adapter="dml-local-adapter", + kwargs={"host": "worker.example"}, + sub=_sub_runnable(), + ) + + def _fake_run(cmd, input=None, capture_output=None, check=None): + del cmd, capture_output, check + payload = json.loads(cast(bytes, input).decode("utf-8")) + assert payload["state"] == {"job_id": "123"} + return subprocess.CompletedProcess( + args=[], + returncode=0, + stdout=json.dumps({"status": "succeeded", "error": None, "dag_id": "d" * 64}).encode(), + stderr=b"", + ) + + monkeypatch.setattr(subprocess, "run", _fake_run) + + result = SshExecutor.handle( + cache_key="ck-ssh-handle", + execution_id="exec-ssh-handle", + state={"job_id": "123"}, + runnable=runnable, + argv_ptr="s3://bucket/argv", + remote=_remote(), + ) + assert result["status"] == "succeeded" + + +def test_ssh_executor_handle_forwards_cancel_update_fields(monkeypatch): + runnable = _ssh_runnable() + + def _fake_run(cmd, input=None, capture_output=None, check=None): + del cmd, capture_output, check + payload = json.loads(cast(bytes, input).decode("utf-8")) + assert payload["execution_status"] == "cancel-pending" + assert payload["cancel_requested_by"] == "alice@example.com" + return subprocess.CompletedProcess( + args=[], + returncode=0, + stdout=json.dumps({"status": "cancel-detached", "error": None}).encode(), + stderr=b"", + ) + + monkeypatch.setattr(subprocess, "run", _fake_run) + + result = SshExecutor.handle( + cache_key="ck-ssh-cancel", + 
execution_id="exec-ssh-cancel", + state={"job_id": "123"}, + execution_status="cancel-pending", + cancel_requested_by="alice@example.com", + runnable=runnable, + argv_ptr="s3://bucket/argv", + remote=_remote(), + ) + assert result == {"status": "cancel-detached", "error": None} diff --git a/tests/contracts/contrib/executor/test_ssh_resolve_runnable.py b/tests/contracts/contrib/executor/test_ssh_resolve_runnable.py new file mode 100644 index 0000000..2ccef59 --- /dev/null +++ b/tests/contracts/contrib/executor/test_ssh_resolve_runnable.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import pytest + +from daggerml._internal.types import DmlRepoError, Runnable, Uri +from daggerml.contrib import adapter_registry as areg +from daggerml.contrib import executor_registry as ereg +from daggerml.contrib.adapters import LocalAdapter +from daggerml.contrib.executors import ScriptExecutor, SshExecutor + + +@pytest.fixture(autouse=True) +def _reset_registries(tmp_path, monkeypatch): + areg._reset_for_tests() + ereg._reset_for_tests() + monkeypatch.setenv("DML_TEST_FN_STATE_DIR", str(tmp_path / "state")) + monkeypatch.setenv("DML_REMOTE_ROOT", "s3://test-bucket/test-prefix") + areg.register_adapter(LocalAdapter) + ereg.register_executor(ScriptExecutor) + ereg.register_executor(SshExecutor) + yield + areg._reset_for_tests() + ereg._reset_for_tests() + + +def _sub_runnable() -> Runnable: + return Runnable(target=Uri("script"), adapter="dml-local-adapter", kwargs={"x": 1}, sub=None) + + +def test_local_adapter_resolve_runnable_SSH_RES_001_returns_expected_ssh_runnable_shape(): + sub = _sub_runnable() + result = LocalAdapter.resolve_runnable( + "ssh", + {"host": "worker.example", "flags": ["-p", "2222"], "env_files": ["/etc/dml.env"]}, + sub, + ) + + assert isinstance(result, Runnable) + assert result.target.uri == "ssh" + assert result.adapter == "dml-local-adapter" + assert result.sub is sub + assert result.kwargs == { + "host": "worker.example", + "flags": ["-p", 
"2222"], + "env_files": ["/etc/dml.env"], + } + + +@pytest.mark.parametrize( + "kwargs,sub,expected_error", + [ + pytest.param({"host": "worker.example"}, None, "requires sub runnable", id="SSH-RES-002:missing-sub-runnable"), + pytest.param({}, _sub_runnable(), "requires non-empty host", id="SSH-RES-002:missing-host"), + pytest.param( + {"host": "worker.example", "user": "alice"}, + _sub_runnable(), + "Unknown ssh executor kwargs", + id="SSH-RES-002:rejects-unknown-kwargs", + ), + ], +) +def test_local_adapter_resolve_runnable_SSH_RES_002_rejects_invalid_inputs(kwargs, sub, expected_error): + with pytest.raises(DmlRepoError, match=expected_error): + LocalAdapter.resolve_runnable("ssh", kwargs, sub) diff --git a/tests/contracts/contrib/test_adapter_cli_contract.py b/tests/contracts/contrib/test_adapter_cli_contract.py new file mode 100644 index 0000000..0b0e5d0 --- /dev/null +++ b/tests/contracts/contrib/test_adapter_cli_contract.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import json + +from daggerml._internal.exec_state import ExecutionState +from daggerml._internal.types import Runnable, Uri +from daggerml.contrib.adapters import AdapterBase + + +def test_adapter_cli_poll_preserves_launch_state_over_execution_record_state(monkeypatch, capsys): + seen_states = [] + + class DummyAdapter(AdapterBase): + @classmethod + def send( + cls, *, runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by + ): + del runnable, argv_ptr, cache_key, execution_id, remote, execution_status, cancel_requested_by + seen_states.append(state) + if len(seen_states) == 1: + return {"status": "running", "error": None, "state": {"result_path": "/tmp/result.json"}} + return {"status": "succeeded", "error": None, "dag_id": "a" * 64} + + monkeypatch.setattr( + ExecutionState, + "read_launch_state", + lambda self, execution_id: { + "resume_state": {"container_id": "cid-123"}, + "created_at": 1, + "execution_id": execution_id, + "cache_key": 
self.cache_key, + }, + ) + monkeypatch.setattr( + ExecutionState, + "read_execution_record", + lambda self, execution_id: { + "execution_id": execution_id, + "cache_key": self.cache_key, + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + }, + ) + monkeypatch.setattr("daggerml.contrib.adapters.time.sleep", lambda _: None) + monkeypatch.setattr( + "sys.stdin.read", + lambda: DummyAdapter._dump_payload( + runnable=Runnable(target=Uri("dummy"), adapter="dummy", kwargs={}), + argv_ptr="ptr", + cache_key="ck", + execution_id="exec-ck", + remote={"root": "s3://bucket/root"}, + state=None, + ).decode("utf-8"), + ) + + exit_code = DummyAdapter.cli(["--poll"]) + + assert exit_code == 0 + assert seen_states == [None, {"result_path": "/tmp/result.json"}] + assert json.loads(capsys.readouterr().out.strip()) == {"status": "succeeded", "error": None, "dag_id": "a" * 64} diff --git a/tests/contracts/contrib/test_adapter_registry_contract.py b/tests/contracts/contrib/test_adapter_registry_contract.py new file mode 100644 index 0000000..5fc638d --- /dev/null +++ b/tests/contracts/contrib/test_adapter_registry_contract.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +import pytest + +from daggerml._internal.types import DmlRepoError, Runnable, Uri +from daggerml.contrib import adapter_registry as reg +from daggerml.contrib.adapters import LambdaAdapter + + +@dataclass +class AdapterSpec: + name: str + executable: str = "x" + + def resolve_runnable(self, uri, kwargs, sub): + return (uri, kwargs, sub) + + @staticmethod + def send(*, runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def cli(argv=None): + return 0 + + +@pytest.fixture(autouse=True) +def _reset_registry(): + reg._reset_for_tests() + yield 
+ reg._reset_for_tests() + + +def test_register_get_and_list_adapter(): + reg.register_adapter(AdapterSpec("custom")) + loaded = reg.get_adapter("custom") + assert loaded.name == "custom" + assert reg.list_adapters() == ["custom", "lambda", "local"] + + +def test_register_adapter_accepts_class_object(): + class CustomAdapter: + name = "custom" + executable = "custom-exec" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return (uri, kwargs, sub) + + @staticmethod + def send(*, runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def cli(argv=None): + return 0 + + reg.register_adapter(CustomAdapter) + loaded = reg.get_adapter("custom") + assert loaded is CustomAdapter + + +def test_register_adapter_missing_required_attribute_fails(): + class BadAdapter: + name = "bad" + executable = "bad-exec" + + with pytest.raises(DmlRepoError, match="missing required attribute: resolve_runnable"): + reg.register_adapter(BadAdapter) + + +def test_get_unknown_adapter_fails(): + with pytest.raises(DmlRepoError, match="Adapter 'missing' is not registered"): + reg.get_adapter("missing") + + +class _FakeEntryPoint: + def __init__(self, name: str, value: str, loaded): + self.name = name + self.value = value + self._loaded = loaded + + def load(self): + return self._loaded + + +def test_plugin_loading_contract_variants(monkeypatch): + def _factory(): + return AdapterSpec("batch") + + monkeypatch.setattr( + reg, + "_entry_points", + lambda: [ + _FakeEntryPoint("a", "mod:a", AdapterSpec("local")), + _FakeEntryPoint("b", "mod:b", [AdapterSpec("docker"), _factory]), + ], + ) + + assert reg.list_adapters() == ["batch", "docker", "local"] + + +def test_plugin_loading_invalid_return_fails(monkeypatch): + monkeypatch.setattr(reg, "_entry_points", lambda: [_FakeEntryPoint("bad", "mod:bad", object())]) + + with pytest.raises(DmlRepoError, 
match="returned invalid adapter registration"): + reg.load_adapter_plugins() + + +def test_lambda_adapter_invokes_runnable_target(monkeypatch): + seen = {} + + class _Payload: + def read(self): + return ( + b'{"status":"succeeded","error":null,' + b'"dag_id":"dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"}' + ) + + class _Client: + def invoke(self, **kwargs): + seen.update(kwargs) + return {"Payload": _Payload()} + + monkeypatch.setattr("daggerml.contrib.adapters.get_client", lambda name: _Client()) + + result = LambdaAdapter.send( + runnable=Runnable(target=Uri("lambda-fn"), adapter="dml-lambda-adapter", kwargs={}), + argv_ptr="ptr", + cache_key="ck", + execution_id="exec-ck", + remote={}, + state=None, + execution_status=None, + cancel_requested_by=None, + ) + + assert result == {"status": "succeeded", "error": None, "dag_id": "d" * 64} + assert seen["FunctionName"] == "lambda-fn" + + +def test_pyproject_declares_builtin_adapter_entry_points(): + pyproject = (Path(__file__).resolve().parents[3] / "pyproject.toml").read_text() + + assert '[project.entry-points."daggerml.contrib.adapters"]' in pyproject + assert 'local = "daggerml.contrib.adapters:LocalAdapter"' in pyproject + assert 'lambda = "daggerml.contrib.adapters:LambdaAdapter"' in pyproject diff --git a/tests/contracts/contrib/test_batch_executor_contract.py b/tests/contracts/contrib/test_batch_executor_contract.py new file mode 100644 index 0000000..3ba2426 --- /dev/null +++ b/tests/contracts/contrib/test_batch_executor_contract.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import json + +import pytest + +from daggerml import Uri +from daggerml._internal.exec_state import ExecutionState +from daggerml._internal.types import DmlRepoError, Runnable +from daggerml.contrib.executors.batch import _ADAPTER_IO_NAME, BatchExecutor + + +class _FakeBatchClient: + def __init__(self, *, jobs=None): + self.jobs = jobs or [{"status": "RUNNING"}] + self.registered = [] + self.submitted = 
[] + self.terminated = [] + self.canceled = [] + self.deregistered = [] + + def register_job_definition(self, **kwargs): + self.registered.append(kwargs) + return {"jobDefinitionArn": "arn:batch:def/123"} + + def submit_job(self, **kwargs): + self.submitted.append(kwargs) + return {"jobId": "job-123"} + + def describe_jobs(self, **kwargs): + return {"jobs": self.jobs} + + def terminate_job(self, **kwargs): + self.terminated.append(kwargs) + + def cancel_job(self, **kwargs): + self.canceled.append(kwargs) + + def deregister_job_definition(self, **kwargs): + self.deregistered.append(kwargs) + + +@pytest.fixture(autouse=True) +def _setup(monkeypatch, tmp_path): + monkeypatch.setenv("DML_TEST_FN_STATE_DIR", str(tmp_path)) + monkeypatch.setenv("CPU_QUEUE", "cpu-q") + monkeypatch.setenv("GPU_QUEUE", "gpu-q") + monkeypatch.setenv("BATCH_TASK_ROLE_ARN", "arn:role/batch") + + +def _sub() -> Runnable: + return Runnable(target=Uri("script"), adapter="dml-local-adapter", kwargs={"x": 1}) + + +_REMOTE = {"root": "s3://test-bucket/test-prefix"} + + +def test_batch_executor_resolve_runnable_shape(): + runnable = BatchExecutor.resolve_runnable( + "batch", + {"lambda_uri": "lambda-fn", "image": Uri("repo/image:tag"), "cpu": 2, "memory": 2048, "gpu": 1}, + _sub(), + ) + + assert runnable.target.uri == "lambda-fn" + assert runnable.adapter == "dml-lambda-adapter" + assert runnable.kwargs == {"image": Uri("repo/image:tag"), "cpu": 2, "memory": 2048, "gpu": 1} + assert runnable.sub is not None + + +def test_batch_executor_resolve_runnable_rejects_bad_input(): + with pytest.raises(DmlRepoError, match="requires sub runnable"): + BatchExecutor.resolve_runnable("batch", {"lambda_uri": "lambda-fn", "image": Uri("img")}, None) + with pytest.raises(DmlRepoError, match="Unknown batch executor kwargs"): + BatchExecutor.resolve_runnable("batch", {"lambda_uri": "lambda-fn", "image": Uri("img"), "oops": 1}, _sub()) + with pytest.raises(DmlRepoError, match="image must be a Uri"): + 
BatchExecutor.resolve_runnable("batch", {"lambda_uri": "lambda-fn", "image": "img"}, _sub()) + + +def test_batch_executor_start_submits_job_and_writes_state(monkeypatch): + fake_client = _FakeBatchClient() + monkeypatch.setattr(BatchExecutor, "_client", staticmethod(lambda: fake_client)) + + runnable = BatchExecutor.resolve_runnable( + "batch", {"lambda_uri": "lambda-fn", "image": Uri("repo/image:tag")}, _sub() + ) + + cache_key = "batch-start" + execution_id = "exec-batch-start" + argv_ptr = "argv://ptr" + + executor = BatchExecutor() + result = executor.start( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=_REMOTE, + ) + + assert result["status"] == "running" + written_state = result["state"] + assert fake_client.registered + assert fake_client.submitted == [ + {"jobName": "dml-batch-batch-start", "jobQueue": "cpu-q", "jobDefinition": "arn:batch:def/123"} + ] + assert written_state["job_id"] == "job-123" + assert written_state["job_definition"] == "arn:batch:def/123" + # input_uri and output_uri must NOT be in state (derived from AdapterIO) + assert "input_uri" not in written_state + assert "output_uri" not in written_state + + +def test_batch_executor_start_writes_input_payload_to_s3(monkeypatch): + fake_client = _FakeBatchClient() + monkeypatch.setattr(BatchExecutor, "_client", staticmethod(lambda: fake_client)) + + runnable = BatchExecutor.resolve_runnable( + "batch", {"lambda_uri": "lambda-fn", "image": Uri("repo/image:tag")}, _sub() + ) + + cache_key = "batch-payload" + execution_id = "exec-payload" + remote = _REMOTE + + BatchExecutor().start( + runnable=runnable, + argv_ptr="argv://ptr", + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + ) + + exec_state = ExecutionState(cache_key, remote_root=remote["root"]) + io = exec_state.adapter_io(execution_id, _ADAPTER_IO_NAME) + raw = exec_state._get_object_bytes(io._input_key) + assert raw is not None + payload = json.loads(raw[0]) + 
assert payload["cache_key"] == cache_key + assert payload["execution_id"] == execution_id + + +def test_batch_executor_start_passes_s3_uris_to_container_command(monkeypatch): + fake_client = _FakeBatchClient() + monkeypatch.setattr(BatchExecutor, "_client", staticmethod(lambda: fake_client)) + + runnable = BatchExecutor.resolve_runnable( + "batch", {"lambda_uri": "lambda-fn", "image": Uri("repo/image:tag")}, _sub() + ) + + BatchExecutor().start( + runnable=runnable, + argv_ptr="argv://ptr", + cache_key="batch-cmd", + execution_id="exec-cmd", + remote=_REMOTE, + ) + + container_props = fake_client.registered[0]["containerProperties"] + cmd = container_props["command"] + # Command must include --poll, -i , -o + assert "--poll" in cmd + i_idx = cmd.index("-i") + o_idx = cmd.index("-o") + assert cmd[i_idx + 1].startswith("s3://") + assert cmd[o_idx + 1].startswith("s3://") + + +def test_batch_executor_poll_returns_running_while_batch_running(monkeypatch): + fake_client = _FakeBatchClient(jobs=[{"status": "RUNNING"}]) + monkeypatch.setattr(BatchExecutor, "_client", staticmethod(lambda: fake_client)) + + executor = BatchExecutor() + result = executor.poll( + cache_key="batch-poll", + execution_id="exec-batch-poll", + state={"job_id": "job-123"}, + remote=_REMOTE, + ) + + assert result["status"] == "running" + assert result["state"]["job_id"] == "job-123" + + +def test_batch_executor_poll_returns_succeeded_when_batch_succeeded(monkeypatch): + fake_client = _FakeBatchClient(jobs=[{"status": "SUCCEEDED"}]) + monkeypatch.setattr(BatchExecutor, "_client", staticmethod(lambda: fake_client)) + + cache_key = "batch-poll-ok" + execution_id = "exec-batch-ok" + remote = _REMOTE + dag_id = "a" * 64 + sub_result = {"status": "succeeded", "error": None, "dag_id": dag_id} + + # Pre-write result to S3 via AdapterIO + exec_state = ExecutionState(cache_key, remote_root=remote["root"]) + io = exec_state.adapter_io(execution_id, _ADAPTER_IO_NAME) + exec_state._put_object(io._output_key, 
json.dumps(sub_result).encode()) + + executor = BatchExecutor() + result = executor.poll( + cache_key=cache_key, + execution_id=execution_id, + state={"job_id": "job-123"}, + remote=remote, + ) + + assert result["status"] == "succeeded" + assert result["dag_id"] == dag_id + + +def test_batch_executor_poll_returns_failed_when_output_absent(monkeypatch): + fake_client = _FakeBatchClient(jobs=[{"status": "SUCCEEDED"}]) + monkeypatch.setattr(BatchExecutor, "_client", staticmethod(lambda: fake_client)) + + result = BatchExecutor().poll( + cache_key="batch-no-out", + execution_id="exec-no-out", + state={"job_id": "job-123"}, + remote=_REMOTE, + ) + + assert result["status"] == "failed" + assert "output not yet written" in result["error"] + + +def test_batch_executor_poll_reads_batch_failure_reason(monkeypatch): + fake_client = _FakeBatchClient(jobs=[{"status": "FAILED", "statusReason": "boom", "attempts": []}]) + monkeypatch.setattr(BatchExecutor, "_client", staticmethod(lambda: fake_client)) + executor = BatchExecutor() + result = executor.poll( + cache_key="batch-fail", + execution_id="exec-batch-fail", + state={"job_id": "job-123"}, + remote=_REMOTE, + ) + + assert result["status"] == "failed" + assert result["error"] is not None + assert "Batch job job-123 failed: boom" in result["error"] + + +def test_batch_executor_poll_returns_failed_for_missing_job_id(): + executor = BatchExecutor() + result = executor.poll( + cache_key="batch-no-id", + execution_id="exec-batch-no-id", + state={}, + remote=_REMOTE, + ) + + assert result["status"] == "failed" + assert "job_id" in result["error"] + + +def test_batch_executor_cancel_cleans_up_backend_resources(monkeypatch): + fake_client = _FakeBatchClient() + monkeypatch.setattr(BatchExecutor, "_client", staticmethod(lambda: fake_client)) + + result = BatchExecutor().cancel( + cache_key="batch-cancel", + execution_id="exec-batch-cancel", + state={"job_id": "job-123", "job_definition": "arn:batch:def/123"}, + remote=_REMOTE, + ) + + 
assert result == {"status": "cancel-detached", "error": None} + assert fake_client.canceled == [{"jobId": "job-123", "reason": "daggerml cancellation requested"}] + assert fake_client.deregistered == [{"jobDefinition": "arn:batch:def/123"}] diff --git a/tests/contracts/contrib/test_cfn_executor_contract.py b/tests/contracts/contrib/test_cfn_executor_contract.py new file mode 100644 index 0000000..62c0827 --- /dev/null +++ b/tests/contracts/contrib/test_cfn_executor_contract.py @@ -0,0 +1,311 @@ +from __future__ import annotations + +from contextlib import contextmanager + +import pytest + +from daggerml import Runnable, Uri +from daggerml.contrib.executors.cfn import CfnExecutor + + +class _FakeDag: + pass + + +class _FakeRef: + def __init__(self, dag_id): + self._dag_id = dag_id + + def id(self): + return self._dag_id + + +class _FakeDml: + def __init__(self, dag, calls): + self._dag = dag + self._calls = calls + self.runtime = type("_Runtime", (), {"create": self._create})() + + def _create(self, *, argv_ptr): + self._calls.append(("runtime.create", argv_ptr)) + return "index-1" + + +@contextmanager +def _fake_new(*, dml=None, name="", message="", argv_ptr=None, fake_dag=None): + del name, message + assert dml is not None + dml.runtime.create(argv_ptr=argv_ptr) + yield fake_dag + + +_REMOTE = {"root": "s3://bucket/root"} + + +def test_cfn_tmpdag_is_context_manager(monkeypatch): + dag = _FakeDag() + calls = [] + + @contextmanager + def _temporary(*, remote_root, name): + calls.append(("temporary", remote_root, name)) + yield _FakeDml(dag, calls) + + monkeypatch.setattr( + "daggerml.contrib.executors.cfn.temporary", + lambda **kwargs: _temporary(**kwargs), + ) + monkeypatch.setattr("daggerml.contrib.executors.cfn.new", lambda **kwargs: _fake_new(**kwargs, fake_dag=dag)) + + with CfnExecutor._tmpdag("argv://ptr", remote_root=_REMOTE["root"]) as result: + assert result is dag + + assert calls == [ + ("temporary", _REMOTE["root"], calls[0][2]), + ("runtime.create", 
"argv://ptr"), + ] + + +def test_cfn_tmpdag_propagates_setup_errors(monkeypatch): + @contextmanager + def _temporary(*, remote_root, name): + assert remote_root == _REMOTE["root"] + assert name + raise RuntimeError("boom") + yield + + monkeypatch.setattr( + "daggerml.contrib.executors.cfn.temporary", + lambda **kwargs: _temporary(**kwargs), + ) + + with pytest.raises(RuntimeError, match="boom"): + with CfnExecutor._tmpdag("argv://ptr", remote_root=_REMOTE["root"]): + pass + + +class _ArgvValue: + def __init__(self, values): + self._values = values + + def __getitem__(self, item): + return _ArgvValue(self._values[item]) + + def value(self): + return self._values + + +class _ArgvDag: + def __init__(self, values): + self.argv = _ArgvValue(values) + + +class _StartDml: + def __init__(self, dag): + self._dag = dag + self.runtime = type("_Runtime", (), {"create": self._create})() + + def _create(self, *, argv_ptr): + assert argv_ptr == "argv://ptr" + return "index-1" + + +def test_cfn_start_uses_existing_stack_id_on_no_update(monkeypatch): + dag = _ArgvDag((None, "stack-name", {"Resources": {}}, {"Param": "Value"})) + + @contextmanager + def _temporary(*, remote_root, name): + assert remote_root == _REMOTE["root"] + assert name + yield _StartDml(dag) + + class _FakeClient: + def describe_stacks(self, *, StackName): + assert StackName == "stack-name" + return {"Stacks": [{"StackId": "stack-123"}]} + + def update_stack(self, **kwargs): + assert kwargs["StackName"] == "stack-name" + raise Exception("No updates are to be performed") + + poll_calls = [] + + def _poll(self, *, cache_key, execution_id, state, remote): + poll_calls.append({"cache_key": cache_key, "execution_id": execution_id, "state": state}) + return {"status": "running", "error": None, "state": state} + + monkeypatch.setattr( + "daggerml.contrib.executors.cfn.temporary", + lambda **kwargs: _temporary(**kwargs), + ) + monkeypatch.setattr("daggerml.contrib.executors.cfn.new", lambda **kwargs: _fake_new(**kwargs, 
fake_dag=dag)) + monkeypatch.setattr(CfnExecutor, "_client", staticmethod(lambda: _FakeClient())) + monkeypatch.setattr(CfnExecutor, "poll", _poll) + + CfnExecutor().start( + cache_key="cache-key", + execution_id="exec-cfn-start", + runnable=Runnable(target=Uri("cfn"), kwargs={}, adapter="dml-local-adapter"), + argv_ptr="argv://ptr", + remote=_REMOTE, + ) + + assert len(poll_calls) == 1 + assert poll_calls[0]["cache_key"] == "cache-key" + assert poll_calls[0]["execution_id"] == "exec-cfn-start" + assert poll_calls[0]["state"] == {"stack_name": "stack-name", "stack_id": "stack-123", "argv_ptr": "argv://ptr"} + + +def test_cfn_commit_dag_returns_committed_dag_id(monkeypatch): + class _CommitDag: + def __init__(self): + self.values = {} + self.ref = _FakeRef("dag-cfn-123") + self.stack_id = None + self.stack_name = None + self.outputs = None + self.committed = None + + def __setitem__(self, key, value): + self.values[key] = value + + def commit(self, value): + self.committed = value + + dag = _CommitDag() + + @contextmanager + def _tmpdag(_argv_ptr, *, remote_root): + assert remote_root == _REMOTE["root"] + yield dag + + monkeypatch.setattr( + CfnExecutor, + "_tmpdag", + classmethod(lambda cls, argv_ptr, remote_root: _tmpdag(argv_ptr, remote_root=remote_root)), + ) + + dag_id = CfnExecutor._commit_dag( + {"argv_ptr": "argv://ptr", "stack_name": "stack-name"}, + {"StackId": "stack-123"}, + {"OutputA": "value-a"}, + remote_root=_REMOTE["root"], + ) + + assert dag_id == "dag-cfn-123" + assert dag.values == {"OutputA": "value-a"} + assert dag.stack_id == "stack-123" + assert dag.stack_name == "stack-name" + assert dag.outputs == {"OutputA": "value-a"} + assert dag.committed == {"OutputA": "value-a"} + + +def test_cfn_poll_marks_success_with_committed_dag_id(monkeypatch): + class _FakeClient: + def describe_stacks(self, *, StackName): + assert StackName == "stack-name" + return { + "Stacks": [ + { + "StackId": "stack-123", + "StackStatus": "CREATE_COMPLETE", + "Outputs": 
[{"OutputKey": "OutputA", "OutputValue": "value-a"}], + } + ] + } + + commit_calls = [] + + def _commit_dag(cls, metadata, stack, outputs, *, remote_root): + assert remote_root == _REMOTE["root"] + commit_calls.append((metadata, stack, outputs)) + return "dag-cfn-success" + + monkeypatch.setattr(CfnExecutor, "_client", staticmethod(lambda: _FakeClient())) + monkeypatch.setattr(CfnExecutor, "_commit_dag", classmethod(_commit_dag)) + result = CfnExecutor().poll( + cache_key="cache-key", + execution_id="exec-cfn-success", + state={"stack_name": "stack-name", "argv_ptr": "argv://ptr"}, + remote=_REMOTE, + ) + + assert result["status"] == "succeeded" + assert result["dag_id"] == "dag-cfn-success" + assert commit_calls == [ + ( + {"stack_name": "stack-name", "argv_ptr": "argv://ptr"}, + { + "StackId": "stack-123", + "StackStatus": "CREATE_COMPLETE", + "Outputs": [{"OutputKey": "OutputA", "OutputValue": "value-a"}], + }, + {"OutputA": "value-a"}, + ) + ] + + +def test_cfn_cancel_starts_rollback_or_delete_and_reports_cancelled(monkeypatch): + calls = [] + + class _FakeClient: + def cancel_update_stack(self, *, StackName): + calls.append(("cancel_update_stack", StackName)) + + monkeypatch.setattr(CfnExecutor, "_client", staticmethod(lambda: _FakeClient())) + + result = CfnExecutor().cancel( + cache_key="cache-key", + execution_id="exec-cfn-cancel", + state={"stack_name": "stack-name", "argv_ptr": "argv://ptr"}, + remote=_REMOTE, + ) + + assert result == {"status": "cancel-detached", "error": None} + assert calls == [("cancel_update_stack", "stack-name")] + + +def test_cfn_poll_marks_failed_when_stack_is_missing(monkeypatch): + class _FakeClient: + def describe_stacks(self, *, StackName): + assert StackName == "stack-name" + return {"Stacks": []} + + monkeypatch.setattr(CfnExecutor, "_client", staticmethod(lambda: _FakeClient())) + result = CfnExecutor().poll( + cache_key="cache-key", + execution_id="exec-cfn-missing", + state={"stack_name": "stack-name", "argv_ptr": 
"argv://ptr"}, + remote=_REMOTE, + ) + + assert result["status"] == "failed" + assert result["error"] == "Stack not found: stack-name" + + +def test_cfn_poll_marks_failed_with_stack_event_reasons(monkeypatch): + class _FakeClient: + def describe_stacks(self, *, StackName): + assert StackName == "stack-name" + return {"Stacks": [{"StackId": "stack-123", "StackStatus": "ROLLBACK_COMPLETE"}]} + + def describe_stack_events(self, *, StackName): + assert StackName == "stack-name" + return { + "StackEvents": [ + {"ResourceStatusReason": "First failure"}, + {"LogicalResourceId": "IgnoredWithoutReason"}, + {"ResourceStatusReason": "Second failure"}, + ] + } + + monkeypatch.setattr(CfnExecutor, "_client", staticmethod(lambda: _FakeClient())) + result = CfnExecutor().poll( + cache_key="cache-key", + execution_id="exec-cfn-failed", + state={"stack_name": "stack-name", "argv_ptr": "argv://ptr"}, + remote=_REMOTE, + ) + + assert result["status"] == "failed" + assert result["error"] == "Stack stack-name failed: ROLLBACK_COMPLETE\nFirst failure\nSecond failure" diff --git a/tests/contracts/contrib/test_dagclass_basics_contract.py b/tests/contracts/contrib/test_dagclass_basics_contract.py new file mode 100644 index 0000000..0401c21 --- /dev/null +++ b/tests/contracts/contrib/test_dagclass_basics_contract.py @@ -0,0 +1,490 @@ +from dataclasses import field, is_dataclass +from typing import Any + +import pytest + +from daggerml._internal.types import DmlRepoError +from daggerml.contrib import api + + +def test_dagclass_sets_default_entrypoint_metadata(): + @api.dagclass + class Example: + pass + + obj = Example() + assert is_dataclass(Example) + assert Example.__dict__["__dagclass__"] is True + assert Example.__dict__["__dagclass_entrypoint__"] == "main" + assert obj.__dict__["__dagclass_compiled__"] is True + + +def test_dagclass_sets_custom_entrypoint_metadata(): + @api.dagclass(entrypoint="bar") + class Example: + pass + + obj = Example() + assert 
Example.__dict__["__dagclass_entrypoint__"] == "bar" + assert obj.__dict__["__dagclass_compiled__"] is True + + +def test_dagclass_compiles_exactly_once_per_instance_init(): + @api.dagclass + class Example: + pass + + obj = Example() + assert obj.__dict__["__dagclass_compile_count__"] == 1 + api._compile_dagclass_instance(obj) + assert obj.__dict__["__dagclass_compile_count__"] == 1 + + +def test_dagclass_behaves_like_dataclass_init_for_fields(): + @api.dagclass + class Example: + x: int + y: int = 2 + + obj = Example(5) + assert obj.x == 5 + assert obj.y == 2 + + +def test_dagclass_calls_user_post_init_transparently(): + @api.dagclass + class Example: + x: int + seen_post_init: bool = False + + def __post_init__(self): + self.seen_post_init = True + + obj = Example(1) + assert obj.seen_post_init is True + assert obj.__dict__["__dagclass_compiled__"] is True + + +def test_field_default_factory_value_materialized_on_init(): + @api.dagclass + class Example: + x: Any = field(default_factory=lambda: 2) + + obj = Example() + assert obj.x == 2 + + +def test_field_default_factory_value_can_be_overridden(): + @api.dagclass + class Example: + x: Any = field(default_factory=lambda: 2) + + obj = Example(9) + assert obj.x == 9 + + +def test_field_default_factory_returning_dagclass_binds_entrypoint(): + @api.dagclass + class Other: + main: api.DelayedRunnable = api.DelayedRunnable(uri="script", adapter="local", sub=None, kwargs={}) + + @api.dagclass + class Host: + foo: Any = field(default_factory=lambda: Other()) + + obj = Host() + assert isinstance(obj.foo, api.DelayedRunnable) + assert obj.foo.uri == "script" + + +def test_direct_class_body_dagclass_assignment_binds_entrypoint(): + @api.dagclass + class Other: + main: api.DelayedRunnable = api.DelayedRunnable(uri="script", adapter="local", sub=None, kwargs={}) + + @api.dagclass + class Host: + foo = Other() + + obj = Host() + assert isinstance(obj.foo, api.DelayedRunnable) + assert obj.foo.uri == "script" + + +def 
test_dataclasses_field_runs_before_post_init(): + seen = None + + @api.dagclass + class Example: + x: Any = field(default_factory=lambda: 2) + + def __post_init__(self): + nonlocal seen + seen = self.x + + obj = Example() + assert seen == 2 + assert obj.x == 2 + + +def test_plain_method_compiles_to_delayed_runnable_with_inferred_prepop(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self, a, b=1): + return self.x.value() + a.value() * b.value() + + obj = Example() + assert isinstance(obj.main, api.DelayedRunnable) + assert obj.main.adapter == "local" + assert obj.main.uri == "script" + assert obj.main.kwargs["prepop"] == {"x": api.ref("x")} + + +def test_method_assignment_shadowing_avoids_inferred_prepop(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self, a, b=1): + self.x = a + return self.x.value() + a.value() * b.value() + + obj = Example() + assert isinstance(obj.main, api.DelayedRunnable) + assert obj.main.kwargs["prepop"] == {} + + +def test_method_dependency_orders_compiled_methods_topologically(): + @api.dagclass + class Example: + x: Any = 2 + + def helper(self): + return self.x.value() + + def main(self): + return self.helper() + + obj = Example() + assert obj.__dagclass_member_order__ == ["x", "helper", "main"] + + +def test_method_call_dependency_is_inferred_into_prepop(): + @api.dagclass + class Example: + x: Any = 2 + + def helper(self): + return self.x.value() + + def main(self): + return self.helper() + + obj = Example() + assert isinstance(obj.main, api.DelayedRunnable) + assert obj.main.kwargs["prepop"] == {"helper": api.ref("helper")} + + +def test_field_ref_dependency_orders_member_materialization(): + @api.dagclass + class Example: + x: Any = api.ref("y") + y: Any = 2 + + obj = Example() + assert obj.__dagclass_member_order__ == ["y", "x"] + + +def test_nested_container_ref_dependency_orders_member_materialization(): + @api.dagclass + class Example: + x: Any = field(default_factory=lambda: {"items": 
[api.ref("y")]}) + y: Any = 2 + + obj = Example() + assert obj.__dagclass_member_order__ == ["y", "x"] + + +def test_explicit_delayed_runnable_ref_dependency_orders_member_materialization(): + @api.dagclass + class Example: + x: Any = api.DelayedRunnable(uri="script", adapter="local", sub=None, kwargs={"prepop": {"y": api.ref("y")}}) + y: Any = 2 + + obj = Example() + assert obj.__dagclass_member_order__ == ["y", "x"] + + +def test_field_ref_to_unknown_member_fails(): + @api.dagclass + class Example: + x: Any = api.ref("missing") + + with pytest.raises(DmlRepoError, match="Unknown dagclass member reference: missing"): + Example() + + +def test_member_ref_cycle_fails(): + @api.dagclass + class Example: + x: Any = api.ref("y") + y: Any = api.ref("x") + + with pytest.raises(DmlRepoError, match="member dependency cycle"): + Example() + + +def test_method_assignment_to_unknown_member_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self, a): + self.missing = a + return a + + with pytest.raises(DmlRepoError, match="Unknown dagclass member assignment"): + Example() + + +def test_method_getattr_on_self_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self): + return getattr(self, "x") # noqa: B009 + + with pytest.raises(DmlRepoError, match=r"getattr\(self, \.\.\.\)"): + Example() + + +def test_method_item_access_is_escape_hatch_not_inferred_prepop(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self): + return self["x"].value() + + obj = Example() + assert isinstance(obj.main, api.DelayedRunnable) + assert obj.main.kwargs["prepop"] == {} + + +def test_method_unknown_member_read_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self): + return self.missing + + with pytest.raises(DmlRepoError, match="Unknown dagclass member reference"): + Example() + + +def test_method_assignment_to_compiled_method_name_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def helper(self): + return self.x.value() + + 
def main(self, a): + self.helper = a + return a + + with pytest.raises(DmlRepoError, match="Cannot assign to compiled dagclass method"): + Example() + + +def test_method_setattr_on_self_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self, a): + name = "x" + setattr(self, name, a) + return a + + with pytest.raises(DmlRepoError, match=r"setattr\(self, \.\.\.\)"): + Example() + + +def test_method_hasattr_on_self_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self): + return hasattr(self, "x") + + with pytest.raises(DmlRepoError, match=r"hasattr\(self, \.\.\.\)"): + Example() + + +def test_method_del_self_attr_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self): + del self.x + return 1 + + with pytest.raises(DmlRepoError, match="del self"): + Example() + + +def test_augmented_assignment_counts_as_dependency(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self, a): + self.x += a.value() + return self.x + + obj = Example() + assert obj.main.kwargs["prepop"] == {"x": api.ref("x")} + + +def test_conditional_assignment_on_one_path_keeps_dependency(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self, a, cond): + if cond: + self.x = a + return self.x.value() + + obj = Example() + assert obj.main.kwargs["prepop"] == {"x": api.ref("x")} + + +def test_conditional_assignment_on_all_paths_avoids_dependency(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self, a, b, cond): + if cond: + self.x = a + else: + self.x = b + return self.x.value() + + obj = Example() + assert obj.main.kwargs["prepop"] == {} + + +def test_method_dependency_cycle_fails(): + @api.dagclass + class Example: + def left(self): + return self.right() + + def right(self): + return self.left() + + with pytest.raises(DmlRepoError, match="dependency cycle"): + Example() + + +def test_reserved_field_name_fails(): + @api.dagclass + class Example: + dag: Any = 2 + + with pytest.raises(DmlRepoError, match="reserved 
names: dag"): + Example() + + +def test_reserved_method_name_fails(): + @api.dagclass + class Example: + def commit(self): + return 1 + + with pytest.raises(DmlRepoError, match="reserved names: commit"): + Example() + + +def test_nested_function_definition_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self): + def helper(): + return self.x + + return helper() + + with pytest.raises(DmlRepoError, match="statement type: FunctionDef"): + Example() + + +def test_lambda_capturing_self_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self): + return (lambda: self.x)() + + with pytest.raises(DmlRepoError, match="dynamic or deferred self-capturing constructs"): + Example() + + +def test_comprehension_capturing_self_fails(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self): + return [self.x for _ in [1]] + + with pytest.raises(DmlRepoError, match="dynamic or deferred self-capturing constructs"): + Example() + + +def test_async_method_fails(): + @api.dagclass + class Example: + async def main(self): + return 1 + + with pytest.raises(DmlRepoError, match="must be a single function"): + Example() + + +def test_staticmethod_member_fails(): + @api.dagclass + class Example: + @staticmethod + def main(): + return 1 + + with pytest.raises(DmlRepoError, match="unsupported descriptor type: staticmethod"): + Example() + + +def test_explicit_funkify_method_is_not_recompiled(): + @api.dagclass + class Example: + x: Any = 2 + + @api.funkify(uri="script", adapter="local") + def main(dag): + return dag.x.value() + + obj = Example() + assert isinstance(obj.main, api.DelayedRunnable) + assert obj.main.kwargs.get("prepop") in (None, {}) diff --git a/tests/contracts/contrib/test_docker_executor_contract.py b/tests/contracts/contrib/test_docker_executor_contract.py new file mode 100644 index 0000000..3daed59 --- /dev/null +++ b/tests/contracts/contrib/test_docker_executor_contract.py @@ -0,0 +1,274 @@ +from __future__ import annotations + 
+import json +import os +import shutil +from typing import Any + +import boto3 +import pytest + +from daggerml._internal.exec_state import ExecutionState +from daggerml._internal.types import DmlRepoError, Runnable, Uri +from daggerml.contrib import adapter_registry as areg +from daggerml.contrib import executor_registry as ereg +from daggerml.contrib.adapters import AdapterBase, LocalAdapter +from daggerml.contrib.executors import DockerExecutor, ScriptExecutor + + +@pytest.fixture(autouse=True) +def _reset_registries(tmp_path, monkeypatch): + areg._reset_for_tests() + ereg._reset_for_tests() + monkeypatch.setenv("DML_TEST_FN_STATE_DIR", str(tmp_path / "state")) + # Ensure docker_bin resolves on platforms without docker (tests mock all docker calls) + _orig_which = shutil.which + monkeypatch.setattr(shutil, "which", lambda n: "/usr/bin/docker" if n == "docker" else _orig_which(n)) + areg.register_adapter(LocalAdapter) + ereg.register_executor(ScriptExecutor) + ereg.register_executor(DockerExecutor) + yield + areg._reset_for_tests() + ereg._reset_for_tests() + + +def _remote() -> dict[str, str]: + return {"root": os.environ["DML_REMOTE_ROOT"]} + + +def _sub_runnable() -> Runnable: + return Runnable(target=Uri("script"), adapter="dml-local-adapter", kwargs={"x": 1}, sub=None) + + +def _docker_runnable(**kwargs: Any) -> Runnable: + return Runnable(target=Uri("docker"), adapter="dml-local-adapter", kwargs=kwargs, sub=_sub_runnable()) + + +def test_local_adapter_docker_resolve_runnable_shape(): + sub = _sub_runnable() + result = LocalAdapter.resolve_runnable("docker", {"image": Uri("s3://bucket/image.tar"), "flags": ["--rm"]}, sub) + + assert isinstance(result, Runnable) + assert result.target.uri == "docker" + assert result.adapter == "dml-local-adapter" + assert result.sub is sub + assert result.kwargs == {"image": Uri("s3://bucket/image.tar"), "flags": ["--rm"]} + + +def test_local_adapter_docker_resolve_runnable_rejects_invalid_inputs(): + sub = _sub_runnable() + + 
with pytest.raises(DmlRepoError, match="requires sub runnable"): + LocalAdapter.resolve_runnable("docker", {"image": Uri("s3://bucket/image.tar")}, None) + + with pytest.raises(DmlRepoError, match="requires image"): + LocalAdapter.resolve_runnable("docker", {}, sub) + + +def test_docker_executor_start_launches_container_and_returns_running(monkeypatch): + """start() should run docker and return launch state (no workdir/output_path in state).""" + runnable = _docker_runnable(image="repo/name:tag", flags=["--rm"]) + docker_calls: list[tuple[Any, ...]] = [] + + monkeypatch.setattr(DockerExecutor, "_prepare_image", staticmethod(lambda *args, **_: ("repo/name:tag", None))) + monkeypatch.setattr( + DockerExecutor, + "_run_docker", + staticmethod(lambda *args, **kwargs: docker_calls.append(args) or "cid-123"), + ) + + cache_key = "ck-docker-start" + execution_id = "exec-docker-start" + argv_ptr = "s3://test-bucket/argv" + + executor = DockerExecutor() + result = executor.start( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=_remote(), + ) + + assert result["status"] == "running" + written_state = result["state"] + assert written_state["container_id"] == "cid-123" + # workdir and output_path must NOT be stored in state + assert "workdir" not in written_state + assert "output_path" not in written_state + + # Check docker run args + run_args = docker_calls[0] + assert run_args[0] == "run" + assert "dml-local-adapter" in run_args + assert "--poll" in run_args + + # input_uri and output_uri passed as S3 URIs + run_args_str = " ".join(run_args) + assert "s3://" in run_args_str + + +def test_docker_executor_start_writes_input_payload_to_s3(monkeypatch): + """start() must write the sub-adapter payload to S3 via AdapterIO.""" + runnable = _docker_runnable(image="repo/name:tag") + + monkeypatch.setattr(DockerExecutor, "_prepare_image", staticmethod(lambda *args, **_: ("repo/name:tag", None))) + 
monkeypatch.setattr(DockerExecutor, "_run_docker", staticmethod(lambda *args, **kwargs: "cid-payload")) + + cache_key = "ck-payload" + execution_id = "exec-payload" + remote = _remote() + + DockerExecutor().start( + runnable=runnable, + argv_ptr="s3://test-bucket/argv", + cache_key=cache_key, + execution_id=execution_id, + remote=remote, + ) + + # Verify the input payload was written to S3 via AdapterIO + exec_state = ExecutionState(cache_key, remote_root=remote["root"]) + io = exec_state.adapter_io(execution_id, "local:docker") + raw = exec_state._get_object_bytes(io._input_key) + assert raw is not None + payload = json.loads(raw[0]) + assert payload["cache_key"] == cache_key + assert payload["execution_id"] == execution_id + assert payload["argv_ptr"] == "s3://test-bucket/argv" + + +def test_docker_executor_poll_returns_succeeded_when_container_exited_with_s3_result(monkeypatch): + """poll() reads output from S3 via AdapterIO when container has exited.""" + import subprocess + + cache_key = "ck-poll-ok" + execution_id = "exec-ok" + remote = _remote() + + # Pre-write output to S3 + dag_id = "a" * 64 + exec_state = ExecutionState(cache_key, remote_root=remote["root"]) + io = exec_state.adapter_io(execution_id, "local:docker") + exec_state._put_object( + io._output_key, + json.dumps({"status": "succeeded", "error": None, "dag_id": dag_id}).encode(), + ) + + job_state = {"container_id": "cid-ok", "cleanup_image": None} + + def fake_run(cmd, **kwargs): + class FakeProc: + returncode = 0 + stdout = "exited\n" + stderr = "" + return FakeProc() + + monkeypatch.setattr(subprocess, "run", fake_run) + + result = DockerExecutor().poll(cache_key=cache_key, execution_id=execution_id, state=job_state, remote=remote) + assert result["status"] == "succeeded" + assert result["dag_id"] == dag_id + + +def test_docker_executor_poll_returns_running_when_container_still_running(monkeypatch): + import subprocess + + job_state = {"container_id": "cid-running", "cleanup_image": None} + + 
def fake_run(cmd, **kwargs): + class FakeProc: + returncode = 0 + stdout = "running\n" + stderr = "" + return FakeProc() + + monkeypatch.setattr(subprocess, "run", fake_run) + + result = DockerExecutor().poll( + cache_key="ck-poll-running", execution_id="exec-running", state=job_state, remote=_remote() + ) + assert result["status"] == "running" + + +def test_docker_executor_poll_returns_failed_when_no_s3_output(monkeypatch): + import subprocess + + job_state = {"container_id": "cid-no-output", "cleanup_image": None} + + def fake_run(cmd, **kwargs): + class FakeProc: + returncode = 0 + stdout = "exited\n" + stderr = "" + return FakeProc() + + monkeypatch.setattr(subprocess, "run", fake_run) + + result = DockerExecutor().poll( + cache_key="ck-poll-no-output", execution_id="exec-no-output", state=job_state, remote=_remote() + ) + assert result["status"] == "failed" + assert "without output" in result["error"] + + +def test_docker_executor_cancel_removes_container_and_reports_cancelled(monkeypatch): + cleanup_calls = [] + monkeypatch.setattr(shutil, "which", lambda name: "/usr/bin/docker" if name == "docker" else None) + monkeypatch.setattr( + "daggerml.contrib.executors.docker._cleanup_docker", + lambda container_id, cleanup_image, docker_bin: cleanup_calls.append((container_id, cleanup_image, docker_bin)), + ) + + result = DockerExecutor().cancel( + cache_key="ck-docker-cancel", + execution_id="exec-docker-cancel", + state={"container_id": "cid-cancel", "cleanup_image": "img:tmp"}, + remote=_remote(), + ) + + assert result == {"status": "cancel-detached", "error": None} + assert cleanup_calls == [("cid-cancel", "img:tmp", "/usr/bin/docker")] + + +# --------------------------------------------------------------------------- +# AdapterBase._write_output S3 support +# --------------------------------------------------------------------------- + + +def test_write_output_writes_to_local_file(tmp_path): + out = tmp_path / "result.json" + AdapterBase._write_output(str(out), 
'{"status":"succeeded"}') + assert out.read_text() == '{"status":"succeeded"}' + + +def test_write_output_writes_to_s3_uri(): + bucket = "test-bucket" + key = "test-prefix/write-output-test.json" + data = '{"status":"succeeded","dag_id":"' + "a" * 64 + '"}' + AdapterBase._write_output(f"s3://{bucket}/{key}", data) + + s3 = boto3.client("s3") + body = s3.get_object(Bucket=bucket, Key=key)["Body"].read() + assert body == data.encode("utf-8") + + +def test_write_output_s3_content_type_is_json(): + bucket = "test-bucket" + key = "test-prefix/write-output-ct.json" + AdapterBase._write_output(f"s3://{bucket}/{key}", '{}') + + head = boto3.client("s3").head_object(Bucket=bucket, Key=key) + assert head["ContentType"] == "application/json" + + +def test_write_output_stdout(capsys): + AdapterBase._write_output("-", '{"status":"running"}') + captured = capsys.readouterr() + assert '{"status":"running"}' in captured.out + + +def test_write_output_stdout_appends_newline_if_missing(capsys): + AdapterBase._write_output("-", "no-newline") + captured = capsys.readouterr() + assert captured.out.endswith("\n") diff --git a/tests/contracts/contrib/test_executor_registry_contract.py b/tests/contracts/contrib/test_executor_registry_contract.py new file mode 100644 index 0000000..64e06d0 --- /dev/null +++ b/tests/contracts/contrib/test_executor_registry_contract.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from daggerml import Runnable, Uri +from daggerml._internal.types import DmlRepoError +from daggerml.contrib import executor_registry as reg + + +class ExecutorSpec: + def __init__(self, name: str, adapter: str): + self.name = name + self.adapter = adapter + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @staticmethod + def start(*, runnable, argv_ptr, cache_key, execution_id, remote, state=None): + return {"status": 
"running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def poll(*, state=None, cache_key=None, execution_id=None, remote=None): + return {"status": "running", "error": None, "state": state or {}} + + @staticmethod + def cleanup(*, state=None): + return None + + +@pytest.fixture(autouse=True) +def _reset_registry(): + reg._reset_for_tests() + yield + reg._reset_for_tests() + + +def test_register_get_and_list_executor(): + reg.register_executor(ExecutorSpec("custom", "local")) + loaded = reg.get_executor("local", "custom") + assert loaded.name == "custom" + assert loaded.adapter == "local" + assert reg.list_executors("local") == ["cfn", "custom", "docker", "script", "ssh"] + + +def test_register_executor_accepts_class_object(): + class CustomExecutor: + name = "custom" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @staticmethod + def start(*, runnable, argv_ptr, cache_key, execution_id, remote, state=None): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def poll(*, state=None, cache_key=None, execution_id=None, remote=None): + return {"status": "running", "error": None, "state": state or {}} + + @staticmethod + def cleanup(*, state=None): + return None + + reg.register_executor(CustomExecutor) + loaded = reg.get_executor("local", "custom") + assert loaded is CustomExecutor + + +def test_register_executor_missing_required_attribute_fails(): + class BadExecutor: + name = "bad" + + with pytest.raises(DmlRepoError, match="missing required attribute: adapter"): + reg.register_executor(BadExecutor) + + +def test_register_executor_missing_required_lifecycle_callable_fails(): + class MissingStartExecutor: + name = "missing-start" + adapter = "local" + + @staticmethod + def poll(*, state): + return {"status": "running", "error": None, "state": state} + + with 
pytest.raises(DmlRepoError, match="missing required callables: start, cleanup"): + reg.register_executor(MissingStartExecutor) + + +def test_get_unknown_executor_fails(): + with pytest.raises(DmlRepoError, match="Executor 'missing' is not registered for adapter 'local'"): + reg.get_executor("local", "missing") + + +class _FakeEntryPoint: + def __init__(self, name: str, value: str, loaded): + self.name = name + self.value = value + self._loaded = loaded + + def load(self): + return self._loaded + + +def test_plugin_loading_contract_variants(monkeypatch): + def _factory(): + return ExecutorSpec("docker", "local") + + monkeypatch.setattr( + reg, + "_entry_points", + lambda: [ + _FakeEntryPoint("a", "mod:a", ExecutorSpec("script", "local")), + _FakeEntryPoint("b", "mod:b", [ExecutorSpec("batch", "lambda"), _factory]), + ], + ) + + assert reg.list_executors() == ["batch", "docker", "script"] + assert reg.list_executors("local") == ["docker", "script"] + assert reg.list_executors("lambda") == ["batch"] + + +def test_plugin_loading_invalid_return_fails(monkeypatch): + monkeypatch.setattr(reg, "_entry_points", lambda: [_FakeEntryPoint("bad", "mod:bad", object())]) + + with pytest.raises(DmlRepoError, match="returned invalid executor registration"): + reg.load_executor_plugins() + + +def test_pyproject_declares_builtin_executor_entry_points(): + pyproject = (Path(__file__).resolve().parents[3] / "pyproject.toml").read_text() + + assert '[project.entry-points."daggerml.contrib.executors"]' in pyproject + assert 'batch = "daggerml.contrib.executors:BatchExecutor"' in pyproject + assert 'script = "daggerml.contrib.executors:ScriptExecutor"' in pyproject diff --git a/tests/contracts/contrib/test_lambda_executor_base_contract.py b/tests/contracts/contrib/test_lambda_executor_base_contract.py new file mode 100644 index 0000000..272b040 --- /dev/null +++ b/tests/contracts/contrib/test_lambda_executor_base_contract.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from 
typing import Any, ClassVar + +import pytest + +from daggerml import Runnable, Uri +from daggerml.contrib.executors._lambda import LambdaExecutorBase + +_REMOTE = {"root": "s3://bucket/root"} + + +class _Executor(LambdaExecutorBase): + name = "lambda-test" + + start_calls: ClassVar[list[dict[str, Any]]] = [] + + def start(self, *, cache_key, execution_id, runnable, argv_ptr, remote): + _Executor.start_calls.append( + { + "runnable": runnable, + "argv_ptr": argv_ptr, + "cache_key": cache_key, + "execution_id": execution_id, + "remote": remote, + } + ) + return {"status": "running", "error": None, "state": {"token": execution_id}} + + def poll(self, *, cache_key, execution_id, state, remote): + return {"status": "succeeded", "error": None, "dag_id": "d" * 64} + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-lambda-adapter") + + +class _FailingStartExecutor(LambdaExecutorBase): + name = "lambda-test-failing-start" + + def start(self, *, cache_key, execution_id, runnable, argv_ptr, remote): + raise RuntimeError("boom") + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-lambda-adapter") + + +@pytest.fixture(autouse=True) +def _reset(): + _Executor.start_calls = [] + + +def _payload(*, cache_key: str) -> dict[str, Any]: + runnable = Runnable(target=Uri("lambda-test"), kwargs={"x": 1}, adapter="dml-lambda-adapter") + return { + "runnable": runnable, + "argv_ptr": "argv://ptr", + "cache_key": cache_key, + "execution_id": f"exec-{cache_key}", + "remote": _REMOTE, + "state": None, + } + + +def test_lambda_executor_handler_starts_when_no_job_state(monkeypatch): + cache_key = "lambda-start" + result = _Executor.handler(_payload(cache_key=cache_key), None) + + assert result["status"] == "running" + assert len(_Executor.start_calls) == 1 + + +def 
test_lambda_executor_handler_polls_with_existing_job_state(monkeypatch): + job_state = {"some": "state"} + cache_key = "lambda-poll" + payload = _payload(cache_key=cache_key) + payload["state"] = job_state + result = _Executor.handler(payload, None) + + assert result["status"] == "succeeded" + assert len(_Executor.start_calls) == 0 + + +def test_lambda_executor_handler_returns_failed_on_exception(monkeypatch): + cache_key = "lambda-handler-failure" + result = _FailingStartExecutor.handler(_payload(cache_key=cache_key), None) + + assert result["status"] == "failed" + assert "boom" in result["error"] diff --git a/tests/contracts/contrib/test_ref_contract.py b/tests/contracts/contrib/test_ref_contract.py new file mode 100644 index 0000000..9e3990c --- /dev/null +++ b/tests/contracts/contrib/test_ref_contract.py @@ -0,0 +1,88 @@ +import pytest + +from daggerml import codecs, new +from daggerml._internal.types import DmlRepoError +from daggerml.contrib import api +from tests import temporary_dml + + +def test_ref_returns_delayed_ref(): + value = api.ref("x") + assert isinstance(value, api.DelayedRef) + assert value.name == "x" + + +def test_delayed_action_codec_matches_delayed_ref(): + codec = codecs.DelayedActionCodec() + assert codec.can_encode(api.DelayedRef("x")) + assert codec.can_encode(api.DelayedLoad("d0")) + assert not codec.can_encode("x") + + +def test_ref_resolves_when_staged(): + with temporary_dml() as dml: + dag = new(dml=dml, name="d0", message="d0") + dag.a = 42 + out = dag.put(api.ref("a")) + assert out.value() == 42 + + +def test_ref_resolves_in_nested_values(): + with temporary_dml() as dml: + dag = new(dml=dml, name="d0", message="d0") + dag.a = 7 + out = dag.put({"v": [api.ref("a")]}) + assert out.value() == {"v": [7]} + + +def test_ref_missing_name_fails(): + with temporary_dml() as dml: + dag = new(dml=dml, name="d0", message="d0") + with pytest.raises(DmlRepoError, match="Node 'missing' not found in DAG"): + dag.put(api.ref("missing")) + + +def 
test_load_returns_delayed_load(): + value = api.load("d0") + assert isinstance(value, api.DelayedLoad) + assert value.dagname == "d0" + assert value.nodename is None + + +def test_load_resolves_result_node_when_nodename_none(): + with temporary_dml() as dml: + src = new(dml=dml, name="src", message="src") + src.result_named = 123 + src.commit(999) + + dst = new(dml=dml, name="dst", message="dst") + out = dst.put(api.load("src")) + assert out.value() == 999 + + +def test_load_resolves_named_node_when_nodename_set(): + with temporary_dml() as dml: + src = new(dml=dml, name="src", message="src") + src.result_named = 123 + src.commit(999) + + dst = new(dml=dml, name="dst", message="dst") + out = dst.put(api.load("src", "result_named")) + assert out.value() == 123 + + +def test_load_missing_dag_fails(): + with temporary_dml() as dml: + dag = new(dml=dml, name="d0", message="d0") + with pytest.raises(DmlRepoError, match="DAG 'missing' not found"): + dag.put(api.load("missing")) + + +def test_load_missing_node_fails(): + with temporary_dml() as dml: + src = new(dml=dml, name="src", message="src") + src.commit(1) + + dst = new(dml=dml, name="dst", message="dst") + with pytest.raises(DmlRepoError, match="Node 'missing' not found in DAG 'src'"): + dst.put(api.load("src", "missing")) diff --git a/tests/contracts/contrib/test_run_contract.py b/tests/contracts/contrib/test_run_contract.py new file mode 100644 index 0000000..0817caa --- /dev/null +++ b/tests/contracts/contrib/test_run_contract.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +import pytest + +from daggerml import clear_default_dml, load, set_default_dml +from daggerml._internal.types import DmlRepoError, Runnable, Uri +from daggerml.contrib import adapter_registry as areg +from daggerml.contrib import api +from tests import temporary_dml + + +@pytest.fixture(autouse=True) +def _runtime_setup(): + areg._reset_for_tests() + + @dataclass + class 
TestAdapter: + name: str = "test" + executable: str = "test-adapter" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=kwargs, sub=sub, adapter="") + + @staticmethod + def send(*, runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def cli(argv=None): + return 0 + + areg.register_adapter(TestAdapter()) + with temporary_dml() as dml: + set_default_dml(dml) + yield dml + clear_default_dml() + areg._reset_for_tests() + + +def test_run_executes_entrypoint_and_returns_none(_runtime_setup): + @api.dagclass + class RunExample: + x: Any = 7 + main: Any = api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + + result = api.run(RunExample(), 1, 2, name="run-example") + assert result is None + loaded = load("run-example", dml=_runtime_setup) + assert "x" in loaded.keys() + assert loaded["x"].value() == 7 + assert "main" in loaded.keys() + assert "" in loaded.keys() + assert loaded[""].value() == [1, 2] + assert loaded.result.value() == [1, 2] + + +def test_run_materializes_additional_delayed_runnable_members_by_name(_runtime_setup): + @api.dagclass + class RunExample: + x: Any = 3 + main: Any = api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + alt: Any = api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + + result = api.run(RunExample(), 9, name="run-funkified") + assert result is None + loaded = load("run-funkified", dml=_runtime_setup) + assert "x" in loaded.keys() + assert loaded["x"].value() == 3 + assert "main" in loaded.keys() + assert "alt" in loaded.keys() + assert loaded[""].value() == [9] + + +def test_run_materializes_same_namespace_refs_in_dependency_order(_runtime_setup): + @api.dagclass + class RunExample: + x: Any = api.ref("y") + y: Any = 3 + main: Any = 
api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + + result = api.run(RunExample(), name="run-ref-order") + assert result is None + loaded = load("run-ref-order", dml=_runtime_setup) + assert loaded["y"].value() == 3 + assert loaded["x"].value() == 3 + + +def test_run_entrypoint_override(): + @api.dagclass + class RunExample: + main: Any = api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + alt: Any = api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + + result = api.run(RunExample(), 9, entrypoint="alt", name="run-alt") + assert result is None + + +def test_run_rejects_non_dagclass_instance(): + with pytest.raises(DmlRepoError, match="not a dagclass instance"): + api.run(object()) + + +def test_run_rejects_missing_entrypoint(): + @api.dagclass(entrypoint="missing") + class RunExample: + main: Any = api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + + with pytest.raises(DmlRepoError, match="entrypoint not found"): + api.run(RunExample()) + + +def test_run_rejects_non_delayed_runnable_entrypoint(): + @api.dagclass + class RunExample: + main: Any = 1 + + with pytest.raises(DmlRepoError, match="entrypoint must be DelayedRunnable"): + api.run(RunExample()) + + +def test_run_rejects_uncompiled_instance(): + @api.dagclass + class RunExample: + main: Any = api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + + obj = RunExample() + obj.__dagclass_compiled__ = False + with pytest.raises(DmlRepoError, match="instance is not compiled"): + api.run(obj) + + +def test_run_default_name_format_contains_class_separator(): + @api.dagclass + class RunExample: + main: Any = api.DelayedRunnable(uri="daggerml:list", adapter="test", sub=None, kwargs={}) + + result = api.run(RunExample(), 1) + assert result is None + # name format contract includes :: + # ensure DAG can be loaded by discovered name from the same instance class + default_name = 
api._default_run_name(RunExample()) + assert "::RunExample" in default_name diff --git a/tests/contracts/contrib/test_status_contract.py b/tests/contracts/contrib/test_status_contract.py new file mode 100644 index 0000000..8ed62d5 --- /dev/null +++ b/tests/contracts/contrib/test_status_contract.py @@ -0,0 +1,287 @@ +from __future__ import annotations + +from typing import Any, cast + +import pytest + +import daggerml.codecs as codec_mod +from daggerml import Runnable, Uri +from daggerml.contrib import adapter_registry as areg +from daggerml.contrib import executor_registry as ereg +from daggerml.contrib import status as cstatus + + +class _FakeEntryPoint: + def __init__(self, group: str, name: str, value: str, loaded): + self.group = group + self.name = name + self.value = value + self._loaded = loaded + + def load(self): + if isinstance(self._loaded, Exception): + raise self._loaded + return self._loaded + + +@pytest.fixture(autouse=True) +def _reset_state(): + areg._reset_for_tests() + ereg._reset_for_tests() + with codec_mod._lock: + old_codecs = list(codec_mod._literal_codecs) + old_seq = codec_mod._literal_codec_seq + old_loaded = codec_mod._plugins_loaded + codec_mod._literal_codecs = [] + codec_mod._literal_codec_seq = 0 + codec_mod._plugins_loaded = False + yield + areg._reset_for_tests() + ereg._reset_for_tests() + with codec_mod._lock: + codec_mod._literal_codecs = old_codecs + codec_mod._literal_codec_seq = old_seq + codec_mod._plugins_loaded = old_loaded + + +def test_status_reports_runtime_registrations(monkeypatch): + monkeypatch.setattr(areg, "_entry_points", lambda: []) + monkeypatch.setattr(ereg, "_entry_points", lambda: []) + monkeypatch.setattr(codec_mod, "_entry_points", lambda: []) + + class CustomAdapter: + name = "custom" + executable = "custom-exec" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return (uri, kwargs, sub) + + @staticmethod + def send(*, runnable, argv_ptr, cache_key, execution_id, remote, state, 
execution_status, cancel_requested_by): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def cli(argv=None): + return 0 + + class CustomExecutor: + name = "custom" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @staticmethod + def start(*, runnable, argv_ptr, cache_key, execution_id, remote, state=None): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def poll(*, state=None, cache_key=None, execution_id=None, remote=None): + return {"status": "running", "error": None, "state": state or {}} + + @staticmethod + def cleanup(*, state=None): + return None + + class CustomCodec: + def can_encode(self, value): + return False + + def encode(self, value, ctx): + return value + + areg.register_adapter(CustomAdapter()) + ereg.register_executor(CustomExecutor) + codec_mod.register_codec(CustomCodec(), priority=7) + + result = cast(dict[str, Any], cstatus.status()) + + assert result["schema_version"] == 0 + assert result["summary"] == { + "has_errors": False, + "diagnostic_count": 0, + "adapter_registration_count": 1, + "adapter_effective_count": 1, + "executor_registration_count": 1, + "executor_effective_count": 1, + "codec_registration_count": 1, + "codec_effective_count": 1, + } + + adapter = result["adapters"][0] + assert adapter["key"] == "custom" + assert adapter["fqn"].endswith("CustomAdapter") + assert adapter["effective"] is True + assert adapter["implements"] == {"resolve_runnable": True, "send": True, "cli": True} + assert set(adapter.keys()) == {"key", "fqn", "effective", "implements"} + + executor = result["executors"][0] + assert executor["key"] == "local:custom" + assert executor["fqn"].endswith("CustomExecutor") + assert executor["implements"]["start"] is True + assert executor["implements"]["cleanup"] is True + + codec = 
result["codecs"][0] + assert codec["fqn"].endswith("CustomCodec") + assert codec["effective"] is True + + assert result["diagnostics"] == [] + + +def test_status_reports_best_effort_plugin_failures(monkeypatch): + class PluginAdapter: + name = "plugin" + executable = "plugin-exec" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return (uri, kwargs, sub) + + @staticmethod + def send(*, runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def cli(argv=None): + return 0 + + class PluginExecutor: + name = "script" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @staticmethod + def start(*, runnable, argv_ptr, cache_key, execution_id, remote, state=None): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def poll(*, state=None, cache_key=None, execution_id=None, remote=None): + return {"status": "running", "error": None, "state": state or {}} + + @staticmethod + def cleanup(*, state=None): + return None + + class PluginCodec: + def can_encode(self, value): + return False + + def encode(self, value, ctx): + return value + + monkeypatch.setattr( + areg, + "_entry_points", + lambda: [ + _FakeEntryPoint(areg.ADAPTER_ENTRYPOINT_GROUP, "bad", "pkg.bad:adapter", RuntimeError("nope")), + _FakeEntryPoint(areg.ADAPTER_ENTRYPOINT_GROUP, "plugin", "pkg.good:adapter", PluginAdapter), + ], + ) + monkeypatch.setattr( + ereg, + "_entry_points", + lambda: [_FakeEntryPoint(ereg.EXECUTOR_ENTRYPOINT_GROUP, "script", "pkg.exec:executor", PluginExecutor)], + ) + monkeypatch.setattr( + codec_mod, + "_entry_points", + lambda: [ + _FakeEntryPoint( + codec_mod.LITERAL_CODEC_ENTRYPOINT_GROUP, "codec", "pkg.codec:factory", lambda: (PluginCodec(), 5) + ) + ], + ) 
+ + result = cast(dict[str, Any], cstatus.status()) + + assert result["summary"]["has_errors"] is True + assert result["summary"]["adapter_registration_count"] == 1 + assert result["summary"]["adapter_effective_count"] == 1 + assert result["summary"]["executor_registration_count"] == 1 + assert result["summary"]["codec_registration_count"] == 1 + + assert result["adapters"][0]["key"] == "plugin" + assert result["adapters"][0]["fqn"].endswith("PluginAdapter") + assert result["executors"][0]["key"] == "local:script" + assert result["executors"][0]["fqn"].endswith("PluginExecutor") + assert result["codecs"][0]["key"].startswith("5:0:") + assert result["codecs"][0]["key"].endswith("PluginCodec") + assert result["codecs"][0]["fqn"].endswith("PluginCodec") + assert [item["code"] for item in result["diagnostics"]] == ["entry_point_load_failed"] + + +def test_status_loads_codec_plugins_into_runtime_registry(monkeypatch): + class PluginCodec: + def can_encode(self, value): + return False + + def encode(self, value, ctx): + return value + + monkeypatch.setattr(areg, "_entry_points", lambda: []) + monkeypatch.setattr(ereg, "_entry_points", lambda: []) + monkeypatch.setattr( + codec_mod, + "_entry_points", + lambda: [ + _FakeEntryPoint( + codec_mod.LITERAL_CODEC_ENTRYPOINT_GROUP, "codec", "pkg.codec:factory", lambda: (PluginCodec(), 5) + ) + ], + ) + + result = cast(dict[str, Any], cstatus.status()) + + assert result["summary"]["codec_registration_count"] == 1 + assert result["codecs"][0]["fqn"].endswith("PluginCodec") + assert codec_mod._plugins_loaded is True + assert len(codec_mod._literal_codecs) == 1 + + +def test_status_reports_codec_loader_errors(monkeypatch): + monkeypatch.setattr(areg, "_entry_points", lambda: []) + monkeypatch.setattr(ereg, "_entry_points", lambda: []) + monkeypatch.setattr( + codec_mod, + "_entry_points", + lambda: [ + _FakeEntryPoint( + codec_mod.LITERAL_CODEC_ENTRYPOINT_GROUP, + "codec", + "pkg.codec:factory", + RuntimeError("codec boom"), + ) + 
], + ) + monkeypatch.setattr( + codec_mod, + "ensure_literal_codec_plugins_loaded", + lambda: (_ for _ in ()).throw( + RuntimeError("Literal codec plugin 'codec (pkg.codec:factory)' failed: codec boom") + ), + ) + + result = cast(dict[str, Any], cstatus.status()) + + assert result["summary"]["has_errors"] is True + assert result["summary"]["codec_registration_count"] == 0 + assert result["diagnostics"] == [ + { + "severity": "error", + "scope": "codec", + "code": "entry_point_load_failed", + "message": "Literal codec plugin 'codec (pkg.codec:factory)' failed: codec boom", + "source": { + "kind": "entry_point", + "group": codec_mod.LITERAL_CODEC_ENTRYPOINT_GROUP, + "name": "codec", + "value": "pkg.codec:factory", + }, + "key": None, + } + ] diff --git a/tests/contracts/contrib/test_testing_contract.py b/tests/contracts/contrib/test_testing_contract.py new file mode 100644 index 0000000..a39bf1b --- /dev/null +++ b/tests/contracts/contrib/test_testing_contract.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import os + +from daggerml import Uri +from daggerml.contrib import api +from daggerml.contrib.testing import MockNode, defunkify + + +def test_value_node_returns_wrapped_scalar(): + node = MockNode(7) + + assert node.value() == 7 + + +def test_value_node_returns_wrapped_uri(): + uri = Uri("s3://bucket/path") + node = MockNode(uri) + + assert node.value() is uri + + +def test_value_node_supports_funkify_style_author_code(): + def fn(dag, x, y): + return x.value() + y.value() + + assert fn(None, MockNode(2), MockNode(5)) == 7 + + +def test_mock_node_alias_matches_value_node_behavior(): + node = MockNode(7) + + assert isinstance(node, MockNode) + assert node.value() == 7 + + +def test_defunkify_wraps_plain_args_and_kwargs(): + @api.funkify + def fn(dag, x, *, y): + return x.value() + y.value() + + call = defunkify(fn) + + assert call(None, 2, y=5) == 7 + + +def test_defunkify_unwraps_nested_delayed_runnable(): + @api.funkify(uri="docker", 
image="repo/name") + @api.funkify + def fn(dag, x, y): + return x.value() + y.value() + + call = defunkify(fn) + + assert call(None, MockNode(2), 5) == 7 + + +def test_defunkify_runs_in_isolated_workdir(tmp_path): + touched = [] + + @api.funkify + def fn(dag): + touched.append(os.getcwd()) + open("artifact.txt", "w").write("ok") + return 1 + + cwd = os.getcwd() + call = defunkify(fn) + + assert call(None) == 1 + assert os.getcwd() == cwd + assert touched[0] != cwd + assert not (tmp_path / "artifact.txt").exists() + assert not os.path.exists("artifact.txt") diff --git a/tests/contracts/core/test_core_contract.py b/tests/contracts/core/test_core_contract.py new file mode 100644 index 0000000..a963938 --- /dev/null +++ b/tests/contracts/core/test_core_contract.py @@ -0,0 +1,271 @@ +import os +from pathlib import Path +from typing import cast +from unittest import TestCase + +import pytest + +from daggerml._internal.types import DmlRepoError, Runnable, Uri +from daggerml.api import Dag, DictNode, Error, ListNode, Node, load, new +from tests import temporary_dml + +SUM_URI = "./tests/assets/fns/sum.py" +ASYNC_URI = "./tests/assets/fns/async.py" +TIMEOUT_URI = "./tests/assets/fns/timeout.py" +TEST_FN_DIR = Path(__file__).resolve().parents[2] / "assets" / "internal_fn" +FN_ADAPTER = str(TEST_FN_DIR / "python-fork-adapter.py") + + +class TestSetAttrs: + def _mk_runnable(self, dml, uri: str, adapter: str, defaults: dict | None = None) -> Runnable: + return Runnable(target=Uri(uri), kwargs=defaults or {}, adapter=adapter) + + @pytest.mark.parametrize("x", [[0], (0,), [], ["asdf", None]]) # none contain 1 + def test_list_attrs(self, x, dml): + dag = new(dml=dml, name="d0", message="d0") + n0 = dag.put(x) + assert n0.contains(1).value() is False + assert 1 not in n0 + assert len(n0) == len(x) + for index, item_node in enumerate(n0): + item = x[index] + assert item_node.value() == item + assert n0.contains(item).value() is True + assert item in n0 + assert n0[index].value() == 
item + assert n0.append(1).value() == [*x, 1] + assert n0.conj(1).value() == [*x, 1] + + @pytest.mark.parametrize("x", [{}, {"a": 1}, {"x": 42, "y": {"k0": None}}]) # none contain 'z' + def test_dict_attrs(self, x, dml): + print(f"Testing dict attrs with x={x} and dml={dml}") + dag = new(dml=dml, name="d0", message="d0") + n0 = dag.put(x) + assert n0.contains("z").value() is False + assert "z" not in n0 + assert len(n0) == len(x) + assert n0.get("z", default=123).value() == 123 + for key in n0: + item = x[key] + assert n0[key].value() == item + assert n0.contains(key).value() is True + assert key in n0 + assert n0.get(key).value() == item + assert [(k, v.value()) for k, v in n0.items()] == list(x.items()) + assert n0.keys() == list(x.keys()) + assert [x.value() for x in n0.values()] == list(x.values()) + assert n0.assoc("y", 3).value() == {**x, "y": 3} + assert n0.update({"z": 1, "a": 2}).value() == {**x, "z": 1, "a": 2} + + def test_load_reboot(self, dml): + with new(dml=dml, name="d0", message="d0") as dag: + dag.put(42, name="n0") + dag.commit("foo") + with new(dml=dml, name="d1", message="d1") as dag: + node = dag.put(load("d0", dml=dml).result, name="n1") + assert node.dag == dag + assert node.value() == "foo" + assert node.load()["n0"].value() == 42 + assert dag.put(load("d0", dml=dml)["n0"]).value() == 42 + + def test_put_node_from_other_dag_auto_imports(self, dml): + with new(dml=dml, name="src", message="src") as src: + src.put(99, name="n0") + src.commit(src.n0) + + foreign_node = load("src", dml=dml)["n0"] + with new(dml=dml, name="dst", message="dst") as dst: + imported = dst.put(foreign_node, name="imported") + assert imported.value() == 99 + dst.commit(imported) + + def test_node_call_w_literal_deps(self, dml): + nums = [1, 2, 3] + dag = new(dml=dml, name="d0", message="d0") + fn = self._mk_runnable(dml, SUM_URI, FN_ADAPTER, defaults={"x": 10}) + result = dag.call(fn, *nums) + assert result.value() == sum(nums) + assert "x" in result.load().keys() + 
assert result.load()["x"].value() == 10 + + def test_node_call_w_node_deps(self, dml): + nums = [1, 2, 3] + dag = new(dml=dml, name="d0", message="d0") + fn = self._mk_runnable(dml, SUM_URI, FN_ADAPTER, defaults={"x": dag.put(10)}) + result = dag.call(fn, *nums) + assert result.value() == sum(nums) + assert "x" in result.load().keys() + assert result.load()["x"].value() == 10 + + def test_node_call_w_kwarg(self, dml): + nums = [1, 2, 3] + dag = new(dml=dml, name="d0", message="d0") + fn = self._mk_runnable(dml, SUM_URI, FN_ADAPTER, defaults={"x": 10}) + result = dag.call(fn, *nums, x=100) + assert result.value() == sum(nums) + assert "x" in result.load().keys() + assert result.load()["x"].value() == 100 + + def test_bad_kwarg(self, dml): + nums = [1, 2, 3] + dag = new(dml=dml, name="d0", message="d0") + fn = self._mk_runnable(dml, SUM_URI, FN_ADAPTER, defaults={"x": 10}) + with pytest.raises(DmlRepoError, match=r"Unknown kwarg: y"): + dag.call(fn, *nums, y=100) + + def test_node_call(self, dml): + nums = [1, 2, 3] + dag = new(dml=dml, name="d0", message="d0") + fn = dag.put(self._mk_runnable(dml, SUM_URI, FN_ADAPTER)) + result = fn(*nums) + assert result.value() == sum(nums) + + def test_node_call_runnable(self, dml): + nums = [1, 2, 3] + dag = new(dml=dml, name="d0", message="d0") + fn = self._mk_runnable(dml, SUM_URI, FN_ADAPTER) + result = dag.call(fn, *nums) + assert result.value() == sum(nums) + + def test_load_recursing(self, dml): + nums = [1, 2, 3] + with new(dml=dml, name="d0", message="d0") as dag: + dag.commit(dag.call(self._mk_runnable(dml, SUM_URI, FN_ADAPTER), *nums, name="n1")) + d1 = new(dml=dml, name="d1", message="d1") + n1 = d1.put(load("d0", dml=dml)["n1"], name="n1_1") + assert n1.dag == d1 + n2 = n1.load()["n1"].load()["num_args"] + assert n2.value() == len(nums) + assert n1.value() == sum(nums) + + def test_no_caching(self): + nums = [1, 2, 3] + with temporary_dml() as dml: + with new(dml=dml, name="d0", message="d0") as d1: + n1 = 
d1.call(self._mk_runnable(dml, SUM_URI, FN_ADAPTER), *nums) + uid = n1.load()["uuid"].value() + with temporary_dml() as dml: + with new(dml=dml, name="d1", message="d0") as d1: + n1 = d1.call(self._mk_runnable(dml, SUM_URI, FN_ADAPTER), *nums) + uid1 = n1.load()["uuid"].value() + assert uid == uid1, "Cached dag should have the same UUID" + + def test_nodemap(self, dml): + dag = new(dml=dml, name="d0", message="d0") + dag.a = 23 + node = dag.put(42, name="b") + other = dag.put(420) + assert dag["a"].value() == 23 + assert list(dag) == ["a", "b"] + dag.commit([node, other]) + + def test_set_attrs(self, dml): + dag = new(dml=dml, name="d0", message="d0") + with pytest.raises(DmlRepoError, match="Set literals are not supported"): + dag.put({0}) + + def test_load_constructors(self, dml): + dag = new(dml=dml, name="d0", message="d0") + l0 = dag.put(42) + c0 = dag.put({"a": 1, "b": [l0, "23"]}) + assert c0["b"][0] != l0 + with pytest.raises(NotImplementedError, match="temporarily disabled"): + c0.backtrack("b", 0) + + def test_fn_ok_cache(self, dml): + with new(dml=dml, name="d0", message="d0") as dag: + nodes = [dag.call(self._mk_runnable(dml, SUM_URI, FN_ADAPTER), i, 1, 2) for i in range(2)] + # Add a repeat outside so `nodes` remains unique. 
+ dag.call(self._mk_runnable(dml, SUM_URI, FN_ADAPTER), 0, 1, 2) + dag.commit(nodes[0]) + assert dag.result.value() == 3 + + def test_async_fn_ok(self, dml): + debug_file = os.path.join(dml._context.project_home, "debug") + with new(dml=dml, name="d0", message="d0") as dag: + n1 = dag.call(self._mk_runnable(dml, ASYNC_URI, FN_ADAPTER), 1, 2, 3) + dag.commit(n1) + assert n1.value() == 6 + with open(debug_file, "r") as f: + assert len([1 for _ in f]) == 2 + + def test_async_fn_error(self, dml): + with pytest.raises(Error, match=r".*unsupported operand type.*"): + with new(dml=dml, name="d0", message="d0") as dag: + dag.call(self._mk_runnable(dml, ASYNC_URI, FN_ADAPTER), 1, 2, "asdf") + assert dml.dag.list()["dags"]["d0"] is not None + + def test_async_fn_timeout(self, dml): + with pytest.raises(TimeoutError): + with new(dml=dml, name="d0", message="d0") as dag: + dag.call(self._mk_runnable(dml, TIMEOUT_URI, FN_ADAPTER), 1, 2, 3, timeout=1000) + + def test_load(self, dml): + with new(dml=dml, name="d0", message="d0") as dag: + dag.put(42, name="n0") + dag.commit("foo") + dl = load("d0", dml=dml) + assert isinstance(dl, Dag) + assert dl["n0"].value() == 42 + assert dl.result.value() == "foo" + + def test_put_node_uses_node_codec(self, dml): + dag = new(dml=dml, name="d0", message="d0") + original = dag.put(42, name="n0") + alias = dag.put(original, name="n1") + assert alias.ref == original.ref + assert dag["n1"].value() == 42 + + +class TestBasic(TestCase): + def test_dag_named_node_access_roundtrip(self): + with temporary_dml() as dml: + d0 = new(dml=dml, name="d0", message="d0") + self.assertIsInstance(d0, Dag) + n0 = d0.put([42], name="n0") + self.assertIsInstance(n0, Node) + self.assertEqual(n0.value(), [42]) + assert len(d0) == 1 + self.assertEqual(len(n0), 1) + self.assertEqual(n0.type, "list") + d0["x0"] = n0 + self.assertEqual(d0["x0"], n0) + self.assertEqual(d0.x0, n0) + d0.x1 = 42 + self.assertEqual(d0["x1"].value(), 42) + self.assertEqual(d0.x1.value(), 42) 
+ + def test_dag_collection_materialization_roundtrip(self): + with temporary_dml() as dml: + d0 = new(dml=dml, name="d0", message="d0") + n0 = d0.put([42], name="n0") + d0.x2 = 99 + self.assertEqual(d0.x2.value(), 99) + d0.x3 = 100 + self.assertEqual(d0.x3.value(), 100) + d0.n1 = n0[0] + self.assertIsInstance(n0[0], Node) + self.assertEqual([x.value() for x in n0], [d0.n1.value()]) + self.assertEqual(d0.n1.value(), 42) + d0.n2 = {"x": n0, "y": "z"} + n2 = cast(DictNode, d0.n2) + self.assertNotEqual(n2["x"], n0) + self.assertEqual(n2["x"].value(), n0.value()) + d0.n3 = list(n2.items()) + self.assertIsInstance([x for x in d0.n3], list) + self.assertDictEqual( + {k: v.value() for k, v in n2.items()}, + {"x": n0.value(), "y": "z"}, + ) + d0.n4 = [1, 2, 3, 4, 5] + d0.n5 = cast(ListNode, d0.n4)[1:] + self.assertListEqual([x.value() for x in d0.n5], [2, 3, 4, 5]) + + def test_dag_commit_result_and_delete_then_gc(self): + with temporary_dml() as dml: + d0 = new(dml=dml, name="d0", message="d0") + n0 = d0.put([42], name="n0") + d0.commit(n0) + self.assertEqual(dml.dag.get("d0")["dag"]["result"], n0.ref) + dml.dag.delete("d0", user=dml._context.user or "dml") + dml.admin.gc() diff --git a/tests/contracts/internal/cli/test_method_cli_contract.py b/tests/contracts/internal/cli/test_method_cli_contract.py new file mode 100644 index 0000000..a206472 --- /dev/null +++ b/tests/contracts/internal/cli/test_method_cli_contract.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import json +from io import StringIO +from unittest.mock import patch + +import pytest + +from daggerml._cli import MethodCLI + + +class _NamespaceExampleNamespace: + def __init__(self, project_home: str): + self.project_home = project_home + + def render(self, payload: list[int], enabled: bool = False): + return {"project_home": self.project_home, "payload": payload, "enabled": enabled} + + +class _NamespaceExample: + def __init__(self, project_home: str): + self.project_home = project_home + + 
@property + def namespace(self) -> _NamespaceExampleNamespace: + return _NamespaceExampleNamespace(self.project_home) + + +def test_method_cli_calls_root_classmethod_without_instantiating_root(): + class Example: + def __init__(self): + raise AssertionError("root should not be instantiated") + + @classmethod + def init(cls, value: int = 1): + return {"kind": cls.__name__, "value": value} + + cli = MethodCLI(Example, prog="example") + + with patch("sys.stdout", new_callable=StringIO) as stdout: + assert cli.run(["init", "--value", "7"]) == 0 + + assert json.loads(stdout.getvalue()) == {"kind": "Example", "value": 7} + + +def test_method_cli_resolves_constructor_args_namespaces_and_json_inputs(): + cli = MethodCLI(_NamespaceExample, prog="example") + + with patch("sys.stdout", new_callable=StringIO) as stdout: + assert cli.run(["--project-home", "/tmp/repo", "namespace", "render", "[1, 2]", "--enabled"]) == 0 + + assert json.loads(stdout.getvalue()) == {"enabled": True, "payload": [1, 2], "project_home": "/tmp/repo"} + + +def test_method_cli_only_exposes_classmethods_on_root_class(): + class Namespace: + @classmethod + def init(cls): + return {"kind": cls.__name__} + + class Example: + def __init__(self): + pass + + @property + def namespace(self) -> Namespace: + return Namespace() + + cli = MethodCLI(Example, prog="example") + + with pytest.raises(SystemExit): + cli.parser.parse_args(["namespace", "init"]) + + +def test_method_cli_main_reports_exceptions_to_stderr(): + class Example: + def __init__(self): + pass + + def explode(self): + raise RuntimeError("boom") + + cli = MethodCLI(Example, prog="example") + + with patch("sys.stderr", new_callable=StringIO) as stderr: + assert cli.main(["explode"]) == 1 + + assert "error: boom" in stderr.getvalue() diff --git a/tests/contracts/internal/cli/test_porcelain_contract.py b/tests/contracts/internal/cli/test_porcelain_contract.py new file mode 100644 index 0000000..987754e --- /dev/null +++ 
b/tests/contracts/internal/cli/test_porcelain_contract.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from daggerml.api import new +from tests import temporary_dml + + +def test_show_log_and_diff_return_spec_shapes(): + with temporary_dml() as dml: + with new(dml=dml, name="baseline", message="baseline") as dag: + result = dag.put(1, name="result") + dag.commit(result) + + shown = dml.show() + logged = dml.log() + diffed = dml.diff("HEAD~1", "HEAD") + + assert set(shown.keys()) == {"revision", "commit", "dags", "change"} + assert set(logged.keys()) == {"revision", "commits"} + assert set(diffed.keys()) == {"left", "right", "added", "removed", "updated"} + assert "baseline" in shown["dags"] + + +def test_branch_lists_local_and_remote_tracking_views(tmp_path): + with temporary_dml(repo="source") as source: + remote_root = source._context.remote_root + source.push(None, branch="main", create=True, force=False) + source_uri = source.config.get("remote.project") + + with temporary_dml(repo="target", remote_root=remote_root) as target: + target.fetch(source_uri, None) + local = target.branch() + remote = target.branch(remote=True) + + assert local["remote"] is False + assert "main" in local["branches"] + assert remote["remote"] is True + assert any(branch.startswith("dml://") for branch in remote["branches"]) diff --git a/tests/contracts/internal/conftest.py b/tests/contracts/internal/conftest.py new file mode 100644 index 0000000..c13947b --- /dev/null +++ b/tests/contracts/internal/conftest.py @@ -0,0 +1,14 @@ +from tests.contracts.internal.support.conftest_support import ( # noqa: F401 + _aws_server, + aws_server, + clear_envvars, + db, + integration_remote_ops, + integration_remote_ops_fn, + remote_ops, + s3, + temp_bo, + temp_bo_fn, + temp_db, + temp_db_fn, +) diff --git a/tests/contracts/internal/ops/test_base_ops_contract.py b/tests/contracts/internal/ops/test_base_ops_contract.py new file mode 100644 index 0000000..d85d34c --- /dev/null +++ 
b/tests/contracts/internal/ops/test_base_ops_contract.py @@ -0,0 +1,114 @@ +"""Comprehensive tests for base_ops.py module with real database testing.""" + +from tempfile import TemporaryDirectory + +import pytest +from hypothesis import given + +from daggerml._internal._db import DmlDbEnv, Ref +from daggerml._internal.ops.base_ops import BaseOps, with_retry +from daggerml._internal.types import NAMESPACES, Deletable, ScalarDatum, Uri +from tests.contracts.internal.support.test_db_support import _gen_ref +from tests.contracts.internal.test_types_contract import DmlRepoError, _dml_obj_strategy + + +class TestBaseOps: + """Test BaseOps functionality.""" + + @given(_dml_obj_strategy()) + def test_putget_roundtrip(self, temp_bo, obj): + """Test successful private _get operation.""" + with temp_bo._tx(readonly=False) as ctx: + ref = ctx.put(obj) + with temp_bo._tx(readonly=True) as ctx: + retrieved_obj = ctx.get(ref) + assert retrieved_obj == obj + with temp_bo._tx(readonly=False) as ctx: + ctx.delete(ref) + + @given(_dml_obj_strategy()) + def test_delete(self, temp_bo, obj): + """Test successful private _delete operation.""" + with temp_bo._tx(readonly=False) as ctx: + ref = ctx.put(obj) + with temp_bo._tx(readonly=False) as ctx: + ctx.delete(ref) + with temp_bo._tx(readonly=True) as ctx: + with pytest.raises(DmlRepoError, match="Object not found:"): + ctx.get(ref) + + @given(_dml_obj_strategy()) + def test_iter(self, temp_bo, obj): + """Test successful private _get operation.""" + with temp_bo._tx(readonly=False) as ctx: + ref = ctx.put(obj) + with temp_bo._tx(readonly=True) as ctx: + assert [ref] == list(ctx.iter(ref.ns())) + temp_bo._db.clear_all() + + @given(_dml_obj_strategy()) + def test_exists(self, temp_bo, obj): + """Test successful private _get operation.""" + with temp_bo._tx(readonly=False) as ctx: + ref = ctx.put(obj) + with temp_bo._tx(readonly=True) as ctx: + assert ctx.exists(ref) + with temp_bo._tx(readonly=False) as ctx: + ctx.delete(ref) + with 
temp_bo._tx(readonly=True) as ctx: + assert not ctx.exists(ref) + temp_bo._db.clear_all() + + def test_get_error(self, temp_bo): + """Test private _get operation with error.""" + with pytest.raises(DmlRepoError, match="Object not found:"): + with temp_bo._tx(readonly=True) as ctx: + # Use a valid namespace so we exercise the not-found path. + ctx.get(_gen_ref("commit")) + + def test_with_retry_retries_whole_transaction_on_map_full(self): + """Map-full should resize and retry the whole operation, not a single put.""" + + class ResizeHarness(BaseOps): + def __post_init__(self): + super().__post_init__() + self.attempts = 0 + + @with_retry + def write_pair(self): + self.attempts += 1 + with self._tx(readonly=False) as ctx: + first = ctx.put(ScalarDatum(data="first")) + second = ctx.put(ScalarDatum(data="x" * 700_000)) + return first, second + + with TemporaryDirectory() as temp_dir: + db = DmlDbEnv.create(temp_dir, namespaces=sorted(NAMESPACES), map_size=256 * 1024) + ops = ResizeHarness(db) + first_ref, second_ref = ops.write_pair() + assert ops.attempts >= 2 + with ops._tx(readonly=True) as ctx: + first_obj = ctx.get(first_ref) + second_obj = ctx.get(second_ref) + assert first_obj.data == "first" + assert len(second_obj.data) == 700_000 + + def test_uri_and_deletable_are_mutually_exclusive(self, temp_bo): + uri = Uri(uri="s3://bucket/key") + deletable = Deletable(uri=uri.uri) + + with temp_bo._tx(readonly=False) as ctx: + uri_ref = ctx.put(uri) + assert uri_ref.ns() == "datum-uri" + assert ctx.exists(uri_ref) + assert not ctx.exists(Ref(f"deletable:{uri_ref.id()}")) + + deletable_ref = ctx.put(deletable) + assert deletable_ref.ns() == "deletable" + assert ctx.exists(deletable_ref) + assert not ctx.exists(uri_ref) + + uri_ref_2 = ctx.put(uri) + assert uri_ref_2.id() == uri_ref.id() + assert ctx.exists(uri_ref_2) + assert not ctx.exists(deletable_ref) diff --git a/tests/contracts/internal/ops/test_config_ops_contract.py 
b/tests/contracts/internal/ops/test_config_ops_contract.py new file mode 100644 index 0000000..a1eef06 --- /dev/null +++ b/tests/contracts/internal/ops/test_config_ops_contract.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import pytest + +from daggerml._internal.ops.config import SCOPE_GLOBAL, SCOPE_LOCAL, ConfigOps +from daggerml._internal.types import DmlRepoError + + +def test_config_ops_set_get_local_remote_project(tmp_path): + ops = ConfigOps(project_home=str(tmp_path), config_home=str(tmp_path / "cfg")) + ops.set("remote.project", "dml://alice/demo", scope=SCOPE_LOCAL) + assert ops.get("remote.project", scope=SCOPE_LOCAL) == "dml://alice/demo" + + +def test_config_ops_rejects_selector_bearing_local_remote_project(tmp_path): + ops = ConfigOps(project_home=str(tmp_path), config_home=str(tmp_path / "cfg")) + with pytest.raises(ValueError, match="must not include a branch or tag"): + ops.set("remote.project", "dml://alice/demo@v1", scope=SCOPE_LOCAL) + + +def test_config_ops_set_get_global_user(tmp_path): + ops = ConfigOps(project_home=str(tmp_path), config_home=str(tmp_path / "cfg")) + ops.set("user", "alice@host", scope=SCOPE_GLOBAL) + assert ops.get("user", scope=SCOPE_GLOBAL) == "alice@host" + + +def test_config_ops_validates_scope_restrictions(tmp_path): + ops = ConfigOps(project_home=str(tmp_path), config_home=str(tmp_path / "cfg")) + with pytest.raises(DmlRepoError, match="not valid in global scope"): + ops.set("remote.project", "dml://alice/demo", scope=SCOPE_GLOBAL) + with pytest.raises(DmlRepoError, match="not valid in local scope"): + ops.set("user", "alice@host", scope=SCOPE_LOCAL) + + +def test_config_ops_remote_fetch_workers_set_get_local(tmp_path): + ops = ConfigOps(project_home=str(tmp_path), config_home=str(tmp_path / "cfg")) + ops.set("remote.fetch_workers", "12", scope=SCOPE_LOCAL) + assert ops.get("remote.fetch_workers", scope=SCOPE_LOCAL) == "12" + + +def test_config_ops_set_get_local_remote_root(tmp_path): + ops = 
ConfigOps(project_home=str(tmp_path), config_home=str(tmp_path / "cfg")) + ops.set("remote.root", "s3://bucket/prefix", scope=SCOPE_LOCAL) + assert ops.get("remote.root", scope=SCOPE_LOCAL) == "s3://bucket/prefix" + + +def test_config_ops_remote_fetch_workers_rejects_invalid(tmp_path): + ops = ConfigOps(project_home=str(tmp_path), config_home=str(tmp_path / "cfg")) + with pytest.raises(DmlRepoError, match="positive integer"): + ops.set("remote.fetch_workers", "0", scope=SCOPE_LOCAL) diff --git a/tests/contracts/internal/ops/test_dag_ops_contract.py b/tests/contracts/internal/ops/test_dag_ops_contract.py new file mode 100644 index 0000000..a5ccd09 --- /dev/null +++ b/tests/contracts/internal/ops/test_dag_ops_contract.py @@ -0,0 +1,186 @@ +import pytest +from hypothesis import assume, given, settings +from hypothesis import strategies as st + +from daggerml._internal._db import Ref +from daggerml._internal.ops.dag import DagOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.types import ( + ArgvNode, + Commit, + Dag, + DictDatum, + DmlRepoError, + KwargvNode, + ListDatum, + LiteralNode, + ScalarDatum, + Tree, +) +from tests.contracts.internal.support.test_db_support import REF_ALPHABET, _gen_ref +from tests.contracts.internal.test_types_contract import _dag_strategy, _refs + + +def _put_dag(temp_bo, dag, data) -> Ref: + ref = data.draw(_refs("dag")) + with temp_bo._tx(readonly=False) as txn: + txn.put(dag, to=ref) + return ref + + +def setup(temp_bo, dag, data): + dag_ref = dag and _put_dag(temp_bo, dag, data) + # Create a fake commit/tree context + tree_ref = data.draw(_refs("tree")) + commit_ref = data.draw(_refs("commit")) + head_ref = data.draw(_refs("head")) + with temp_bo._tx(readonly=False) as txn: + txn.put(Tree(dags={"main": dag_ref} if dag_ref else {}), to=tree_ref) + txn.put(Commit(parents=[], tree=tree_ref, author="test", message="test commit"), to=commit_ref) + head_ops = HeadOps(_db=temp_bo._db) + try: + 
head_ops.delete_branch(head_ref.id()) + except DmlRepoError: + pass + head_ops.create_branch(head_ref.id(), commit_ref) + return dag_ref, tree_ref, commit_ref, head_ref + + +class TestDagOps: + def test_list_and_describe(self, temp_bo): + with temp_bo._tx(readonly=False) as txn: + datum_ref = txn.put(ScalarDatum(data=1)) + node_ref = txn.put(LiteralNode(value=datum_ref)) + argv_datum_ref = txn.put(ListDatum(data=[])) + argv_node_ref = txn.put(ArgvNode(value=argv_datum_ref)) + kwargv_datum_ref = txn.put(DictDatum(data={})) + kwargv_node_ref = txn.put(KwargvNode(value=kwargv_datum_ref)) + dag = Dag( + nodes=[node_ref, argv_node_ref, kwargv_node_ref], + names={"result": node_ref}, + result=node_ref, + argv=argv_node_ref, + ) + dag_ref = txn.put(dag) + tree_ref = txn.put(Tree(dags={"main": dag_ref})) + commit_ref = txn.put(Commit(parents=[], tree=tree_ref, author="test", message="test commit")) + HeadOps(_db=temp_bo._db).create_branch("main", commit_ref) + refs = dag_ref, tree_ref, commit_ref + ops = DagOps(temp_bo._db) + # List DAGs in this commit + dags = ops.list() + assert isinstance(dags, list) + assert any(d["id"] == dag_ref.id() for d in dags) + # Describe the DAG + desc = ops.describe(dag_ref) + assert desc["id"] == dag_ref.id() + assert desc["nodes"] == dag.nodes + assert desc["names"] == dag.names + assert desc["result"] == dag.result + assert desc["argv"] == dag.argv + # Clean up + with temp_bo._tx(readonly=False) as txn: + for ref in set(refs): + if ref: + txn.delete(ref) + + def test_list_empty(self, temp_bo): + ops = DagOps(temp_bo._db) + # No heads/commits + assert ops.list() == [] + + @pytest.mark.parametrize( + "arg,msg", + [ + (_gen_ref("node"), "Expected dag ref"), + (_gen_ref("dag"), r"Object not found: Ref\(dag"), # non-existent dag ref + ], + ) + def test_describe_invalid_ref_raises(self, temp_bo, arg, msg): + """describe() should raise ValueError when given a non-dag Ref.""" + ops = DagOps(temp_bo._db) + with pytest.raises(DmlRepoError, 
match=msg): + ops.describe(arg) + + def test_describe_missing_dag_raises(self, temp_bo): + """describe() should raise DmlRepoError when the dag ref does not exist.""" + ops = DagOps(temp_bo._db) + missing = _gen_ref("dag") + with pytest.raises(DmlRepoError): + ops.describe(missing) + + @given(_dag_strategy().filter(lambda d: bool(d.nodes) and bool(d.names) and d.result is not None), st.data()) + @settings(max_examples=10) + def test_get_node_happy_path(self, temp_bo, dag, data): + """get_node should return the named node for a finished DAG.""" + dag_ref, tree_ref, commit_ref, head_ref = setup(temp_bo, dag, data) + try: + ops = DagOps(temp_bo._db) + name = next(iter(dag.names)) + node_ref = ops.get_node(dag_ref, name) + assert node_ref == dag.names[name] + finally: + HeadOps(_db=temp_bo._db).delete_branch(head_ref.id()) + with temp_bo._tx(readonly=False) as txn: + for ref in {dag_ref, tree_ref, commit_ref}: + if ref: + txn.delete(ref) + + @given(_dag_strategy().filter(lambda d: d.is_finished()), st.text(alphabet=REF_ALPHABET, min_size=1, max_size=16)) + def test_get_node_not_found_raises(self, temp_bo, dag, name): + """get_node should raise if the named node is not present.""" + assume(name not in dag.names) + with temp_bo._tx(readonly=False) as txn: + dag_ref = txn.put(dag) + try: + ops = DagOps(temp_bo._db) + with pytest.raises(DmlRepoError): + ops.get_node(dag_ref, name) + finally: + with temp_bo._tx(readonly=False) as txn: + txn.delete(dag_ref) + + @given(_dag_strategy().filter(lambda d: (not d.is_finished()) and bool(d.nodes))) + def test_get_node_unfinished_raises(self, temp_bo, dag): + """get_node should raise if the DAG is not finished.""" + ops = DagOps(temp_bo._db) + # insert the unfinished DAG directly; we don't need a commit/head context + with temp_bo._tx(readonly=False) as txn: + dag_ref = txn.put(dag) + try: + with pytest.raises(DmlRepoError): + ops.get_node(dag_ref, "missing_name") + finally: + with temp_bo._tx(readonly=False) as txn: + 
txn.delete(dag_ref) + + def test_get_argv_happy_path(self, temp_bo): + """get_argv should return the argv node when present.""" + # create a dag with argv present and insert it directly + with temp_bo._tx(readonly=False) as txn: + argv_datum_ref = txn.put(ListDatum(data=[])) + argv_node_ref = txn.put(ArgvNode(value=argv_datum_ref)) + kwargv_datum_ref = txn.put(DictDatum(data={})) + kwargv_node_ref = txn.put(KwargvNode(value=kwargv_datum_ref)) + dag_ref = txn.put(Dag(nodes=[argv_node_ref, kwargv_node_ref], names={}, result=None, argv=argv_node_ref)) + try: + ops = DagOps(temp_bo._db) + assert ops.get_argv(dag_ref) == argv_node_ref + finally: + with temp_bo._tx(readonly=False) as txn: + for r in (dag_ref, argv_node_ref, kwargv_node_ref, argv_datum_ref, kwargv_datum_ref): + if r: + txn.delete(r) + + def test_get_argv_missing_raises(self, temp_bo): + """get_argv should raise when the DAG has no argv node.""" + # insert a dag without argv + with temp_bo._tx(readonly=False) as txn: + dag_ref = txn.put(Dag(nodes=[], names={}, result=None, argv=None)) + try: + ops = DagOps(temp_bo._db) + with pytest.raises(DmlRepoError, match="DAG has no argv node"): + ops.get_argv(dag_ref) + finally: + with temp_bo._tx(readonly=False) as txn: + txn.delete(dag_ref) diff --git a/tests/contracts/internal/ops/test_dml_project_workflows_contract.py b/tests/contracts/internal/ops/test_dml_project_workflows_contract.py new file mode 100644 index 0000000..ed0629c --- /dev/null +++ b/tests/contracts/internal/ops/test_dml_project_workflows_contract.py @@ -0,0 +1,526 @@ +import logging +from contextlib import contextmanager +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import Mock, PropertyMock, patch + +import pytest + +import daggerml._internal.dml as dml_module +from daggerml._internal._db import Ref +from daggerml._internal.config import DmlProjectConfig +from daggerml._internal.dml import Dml +from daggerml._internal.types import DmlRepoError + + 
+@contextmanager +def _opened_db(db=None): + yield db if db is not None else Mock() + + +def test_fetch_pull_push_workflows_delegate_to_remote_ops(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + remote_ops = Mock() + head_ops = Mock() + head_ops.get_attached_head_branch.return_value = "main" + head_ops.require_attached_head_branch.return_value = "main" + project_cfg = SimpleNamespace(owner="alice", name="demo", uri="dml://alice/demo", remote_project="dml://alice/demo") + with ( + patch("daggerml._internal.dml_context.DmlProjectConfig.load", return_value=project_cfg), + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_head_ops", return_value=head_ops), + patch.object(dml_module, "make_remote_ops", return_value=remote_ops), + ): + remote_ops.fetch_uri.return_value = Ref("commit:1") + remote_ops.pull_uri_into_branch.return_value = Ref("commit:2") + remote_ops.push_project_branch.return_value = "projects/alice/demo/heads/main.json" + remote_ops.push_project_tag.return_value = "projects/alice/demo/tags/v1.0.json" + + fetched = ops.fetch("origin", None) + pulled = ops.pull("origin", None, branch=None, user="alice") + pushed = ops.push(None, branch=None, create=False, force=False) + pushed_tag = ops.push("v1.0", branch=None, create=False, force=False) + + remote_ops.fetch_uri.assert_called_once_with("dml://alice/demo#main") + remote_ops.pull_uri_into_branch.assert_called_once_with("dml://alice/demo#main", "main", user="alice") + remote_ops.push_project_branch.assert_called_once_with("dml://alice/demo#main", "main", create=False, force=False) + remote_ops.push_project_tag.assert_called_once_with("dml://alice/demo@v1.0", "main") + assert fetched == Ref("commit:1") + assert pulled == Ref("commit:2") + assert pushed == "projects/alice/demo/heads/main.json" + assert pushed_tag == "projects/alice/demo/tags/v1.0.json" + + +def test_project_workflows_use_dml_owned_s3_client(): + ops = 
Dml(project_home="/repo", remote_root="s3://bucket/prefix") + remote_ops = Mock() + head_ops = Mock() + head_ops.get_attached_head_branch.return_value = "main" + head_ops.require_attached_head_branch.return_value = "main" + project_cfg = SimpleNamespace(owner="alice", name="demo", uri="dml://alice/demo", remote_project="dml://alice/demo") + with ( + patch("daggerml._internal.dml_context.DmlProjectConfig.load", return_value=project_cfg), + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_head_ops", return_value=head_ops), + patch.object(dml_module, "make_remote_ops", return_value=remote_ops), + ): + remote_ops.fetch_uri.return_value = Ref("commit:1") + remote_ops.pull_uri_into_branch.return_value = Ref("commit:2") + remote_ops.push_project_branch.return_value = "projects/alice/demo/heads/main.json" + + fetched = ops.fetch("origin", None) + pulled = ops.pull("origin", None, branch=None, user="alice") + pushed = ops.push(None, branch=None, create=False, force=False) + + remote_ops.fetch_uri.assert_called_once_with("dml://alice/demo#main") + remote_ops.pull_uri_into_branch.assert_called_once_with("dml://alice/demo#main", "main", user="alice") + remote_ops.push_project_branch.assert_called_once_with("dml://alice/demo#main", "main", create=False, force=False) + assert fetched == Ref("commit:1") + assert pulled == Ref("commit:2") + assert pushed == "projects/alice/demo/heads/main.json" + assert ops._s3_client is not None + + +def test_fetch_project_origin_falls_back_to_default_branch_without_attached_head(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + remote_ops = Mock() + project_cfg = SimpleNamespace(owner="alice", name="demo", uri="dml://alice/demo", remote_project="dml://alice/demo") + detached_head_ops = Mock(get_attached_head_branch=Mock(return_value=None)) + with ( + patch("daggerml._internal.dml_context.DmlProjectConfig.load", return_value=project_cfg), + patch.object(dml_module, 
"with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_head_ops", return_value=detached_head_ops), + patch.object(dml_module, "make_remote_ops", return_value=remote_ops), + ): + remote_ops.fetch_uri.return_value = Ref("commit:1") + fetched = ops.fetch("origin", None) + + remote_ops.fetch_uri.assert_called_once_with("dml://alice/demo#main") + assert fetched == Ref("commit:1") + + +def test_push_project_requires_attached_head_or_explicit_branch(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + detached_error = DmlRepoError("Current checkout is detached; attach HEAD or pass an explicit branch") + project_cfg = SimpleNamespace(owner="alice", name="demo", uri="dml://alice/demo") + detached_head_ops = Mock(require_attached_head_branch=Mock(side_effect=detached_error)) + with ( + patch("daggerml._internal.dml.load_project_config", return_value=project_cfg), + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_head_ops", return_value=detached_head_ops), + ): + with pytest.raises(DmlRepoError, match="Current checkout is detached"): + ops.push(None, branch=None, create=False, force=False) + + +def test_checkout_merge_revert_workflows_delegate_to_commit_ops(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + commit_ops = Mock() + commit_ops.merge_into_head.return_value = Ref("commit:3") + commit_ops.revert.return_value = Ref("commit:4") + head_ops = Mock() + head_ops.require_attached_head_branch.return_value = "main" + + with ( + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_commit_ops", return_value=commit_ops), + patch.object(dml_module, "make_head_ops", return_value=head_ops), + patch.object( + dml_module, + "resolve_dml_revision", + return_value=SimpleNamespace(commit=Ref("commit:1"), kind="branch", branch="feature"), + ), + patch.object(dml_module, "resolve_dml_revision_ref", 
return_value=Ref("commit:2")), + ): + checkout = ops.checkout("feature") + merged = ops.merge("origin/main", branch=None, user="alice") + reverted = ops.revert("origin/main", branch=None, user="alice") + + commit_ops.merge_into_head.assert_called_once_with("main", Ref("commit:2"), "alice") + commit_ops.revert.assert_called_once_with("main", Ref("commit:2"), "alice") + head_ops.write_attached_head.assert_called_once_with("feature") + assert checkout["mode"] == "attached" + assert merged == Ref("commit:3") + assert reverted == Ref("commit:4") + + +def test_dag_checkout_delegates_to_commit_ops_with_resolved_defaults(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix", user="alice") + commit_ops = Mock() + commit_ops.checkout_dag.return_value = Ref("commit:3") + head_ops = Mock() + head_ops.require_attached_head_branch.return_value = "main" + + with ( + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_commit_ops", return_value=commit_ops), + patch.object(dml_module, "make_head_ops", return_value=head_ops), + patch.object(dml_module, "resolve_dml_revision_ref", return_value=Ref("commit:2")), + ): + result = ops.dag.checkout("origin/main", "train") + + commit_ops.checkout_dag.assert_called_once_with( + "main", + Ref("commit:2"), + "train", + target_name=None, + replace=False, + user="alice", + ) + assert result == Ref("commit:3") + + +def test_dag_checkout_requires_user_if_not_resolved(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix", user=None) + with ( + patch.object(type(ops._context), "user", new_callable=PropertyMock, return_value=None), + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object( + dml_module, "make_head_ops", return_value=Mock(require_attached_head_branch=Mock(return_value="main")) + ), + patch.object(dml_module, "resolve_dml_revision_ref", return_value=Ref("commit:2")), + ): + with pytest.raises(DmlRepoError, 
match="user is required for dag checkout"): + ops.dag.checkout("origin/main", "train") + + +def test_runtime_cancel_runs_retry_loop_and_returns_stats(caplog): + caplog.set_level(logging.INFO) + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix", user="alice") + index = Mock() + index.cancel.return_value = { + "index_id": "idx-1", + "requested_by": "alice", + "cancelled_path": Path("/tmp/cancelled/idx-1"), + "graph": {("idx-1", "exec-1"), ("idx-1", "exec-2")}, + "candidate_set": {"exec-1", "exec-2"}, + "own_executions": {"exec-1", "exec-2"}, + } + outcomes = { + (2, "exec-1"): {"execution_id": "exec-1", "outcome": None, "lock_retry": True, "cancel_requested": False}, + (2, "exec-2"): {"execution_id": "exec-2", "outcome": -1, "lock_retry": False, "cancel_requested": False}, + (1, "exec-1"): {"execution_id": "exec-1", "outcome": 1, "lock_retry": False, "cancel_requested": True}, + } + + def _cancel_candidate(execution_id, *, requested_by, own_executions): + assert requested_by == "alice" + return outcomes[(len(own_executions), execution_id)] + + index._cancel_execution_candidate.side_effect = _cancel_candidate + sleeps = [] + with ( + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_index_ops", return_value=index), + patch.object(dml_module.time, "sleep", side_effect=lambda delay: sleeps.append(delay)), + ): + result = ops.runtime.cancel("idx-1") + + index.cancel.assert_called_once_with("idx-1", requested_by="alice") + index._complete_index_cancellation.assert_called_once_with( + "idx-1", + cancelled_path=Path("/tmp/cancelled/idx-1"), + own_executions={"exec-1"}, + ) + assert result == { + "index_id": "idx-1", + "iterations": 2, + "graph_edges": 2, + "candidate_count": 2, + "own_execution_count": 1, + "cancelled_count": 1, + "dropped_count": 1, + "lock_retry_count": 1, + } + assert sleeps == [0.05] + assert "runtime.cancel iteration=1 index_id=idx-1 candidates=2 owned=2" in caplog.text + + +def 
test_runtime_cancel_retries_candidate_errors_with_backoff(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix", user="alice") + index = Mock() + index.cancel.return_value = { + "index_id": "idx-1", + "requested_by": "alice", + "cancelled_path": Path("/tmp/cancelled/idx-1"), + "graph": {("idx-1", "exec-1")}, + "candidate_set": {"exec-1"}, + "own_executions": {"exec-1"}, + } + attempts = {"count": 0} + + def _cancel_candidate(execution_id, *, requested_by, own_executions): + assert execution_id == "exec-1" + assert requested_by == "alice" + assert own_executions == {"exec-1"} + attempts["count"] += 1 + if attempts["count"] < 3: + raise DmlRepoError("boom") + return {"execution_id": "exec-1", "outcome": 1, "lock_retry": False, "cancel_requested": True} + + index._cancel_execution_candidate.side_effect = _cancel_candidate + sleeps = [] + with ( + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_index_ops", return_value=index), + patch.object(dml_module.time, "sleep", side_effect=lambda delay: sleeps.append(delay)), + ): + result = ops.runtime.cancel("idx-1") + + assert attempts["count"] == 3 + assert sleeps == [0.05, 0.1] + assert result["iterations"] == 3 + assert result["cancelled_count"] == 1 + + +def test_dag_describe_node_resolves_named_node_with_revision_context(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + revision = SimpleNamespace(commit=Ref("commit:2"), kind="branch", branch="main", tag=None) + node_ops = Mock(describe=Mock(return_value={"id": "4", "ref": Ref("node:4"), "type": "LiteralNode"})) + + with ( + patch( + "daggerml._internal.dml.resolve_node_ref", + return_value=SimpleNamespace(ref=Ref("node:4"), dag="train", revision=revision), + ), + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_node_ops", return_value=node_ops), + ): + result = ops.dag.describe_node("result", dag="train", 
revision="HEAD") + + node_ops.describe.assert_called_once_with(Ref("node:4")) + assert result == { + "revision": {"input": "HEAD", "kind": "branch", "commit": Ref("commit:2"), "branch": "main", "tag": None}, + "node": {"ref": Ref("node:4"), "type": "LiteralNode"}, + } + + +def test_dag_get_node_resolves_named_node_with_explicit_dag_ref(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + node_ops = Mock(get=Mock(return_value={"answer": Ref("datum:5")})) + + with ( + patch( + "daggerml._internal.dml.resolve_node_ref", + return_value=SimpleNamespace(ref=Ref("node:4"), dag="train", revision=None), + ), + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_node_ops", return_value=node_ops), + ): + result = ops.dag.get_node("result", dag="train") + + node_ops.get.assert_called_once_with(Ref("node:4")) + assert result == {"node": {"answer": Ref("datum:5")}} + + +def test_dag_describe_node_accepts_explicit_node_ref_without_dag_context(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + node_ref = Ref("node-literal:4") + node_ops = Mock(describe=Mock(return_value={"id": "4", "ref": node_ref, "type": "LiteralNode"})) + + with ( + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_node_ops", return_value=node_ops), + ): + result = ops.dag.describe_node(node_ref) + + node_ops.describe.assert_called_once_with(node_ref) + assert result == {"node": {"ref": node_ref, "type": "LiteralNode"}} + + +def test_dag_get_node_accepts_explicit_node_ref_without_dag_context(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + node_ref = Ref("node-fn:4") + node_ops = Mock(get=Mock(return_value={"answer": Ref("datum:5")})) + + with ( + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_node_ops", return_value=node_ops), + ): + result = ops.dag.get_node(node_ref) + 
+ node_ops.get.assert_called_once_with(node_ref) + assert result == {"node": {"answer": Ref("datum:5")}} + + +def test_dag_get_node_rejects_ref_like_node_string_with_dag_context(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + + with pytest.raises(DmlRepoError, match="Expected node Ref"): + with patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()): + ops.dag.get_node("node-fn:4", dag="train", revision="HEAD") + + +def test_dag_describe_node_uses_explicit_dag_ref_context_for_named_lookup(): + ops = Dml(project_home="/repo", remote_root="s3://bucket/prefix") + dag_ref = Ref("dag:3") + node_ops = Mock(describe=Mock(return_value={"id": "4", "ref": Ref("node:4"), "type": "LiteralNode"})) + dag_ops = Mock(get_node=Mock(return_value=Ref("node:4"))) + + with ( + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_node_ops", return_value=node_ops), + patch.object(dml_module, "make_dag_ops", return_value=dag_ops), + ): + result = ops.dag.describe_node("result", dag=dag_ref) + + dag_ops.get_node.assert_called_once_with(dag_ref, "result") + assert result == {"node": {"ref": Ref("node:4"), "type": "LiteralNode"}} + + +def test_dml_init_recovers_when_config_exists_and_db_missing(tmp_path): + repo_dir = tmp_path / "repo" + dml_dir = repo_dir / ".dml" + dml_dir.mkdir(parents=True) + (dml_dir / "config.toml").write_text('[remote]\nproject = "dml://alice/demo"\nroot = "s3://bucket/prefix"\n') + + with ( + patch("daggerml._internal.dml.Dml.fetch", return_value=Ref("commit:9")) as mock_fetch, + ): + result = Dml.init(str(repo_dir), remote_root="s3://bucket/prefix") + + mock_fetch.assert_called_once_with("origin", None) + assert (dml_dir / "db").exists() + assert result["created"] == {"db": True, "config": False} + + +def test_dml_boundary_keeps_only_allowed_private_helpers(): + dml = Dml(project_home="/repo") + + assert hasattr(dml, "_context") + assert hasattr(dml, "_s3_client") + 
assert not hasattr(dml, "_with_ops") + assert not hasattr(dml, "_head_ops") + assert not hasattr(dml, "_commit_ops") + assert not hasattr(dml, "_resolve_revision") + + for namespace in (dml.config, dml.runtime, dml.dag, dml.admin): + assert hasattr(namespace, "_dml") + assert not hasattr(namespace, "_selector_payload") + + +def test_dml_init_uses_init_project_layout_for_bootstrap(tmp_path): + repo_dir = tmp_path / "repo" + repo_dir.mkdir() + + with patch("daggerml._internal.dml.Dml.fetch", return_value=Ref("commit:9")) as mock_fetch: + result = Dml.init( + str(repo_dir), + remote_project="dml://alice/demo", + remote_root="s3://bucket/prefix", + user="alice@example-host", + ) + + init_cfg = DmlProjectConfig.load(repo_dir) + assert init_cfg.name == "demo" + assert init_cfg.owner == "alice" + assert init_cfg.remote_project == "dml://alice/demo" + assert init_cfg.remote_root == "s3://bucket/prefix" + assert (repo_dir / ".dml").is_dir() + assert (repo_dir / ".dml" / "config.toml").exists() + assert (repo_dir / ".dml" / "db").exists() + mock_fetch.assert_called_once_with("origin", None) + assert result["project_home"] == str(repo_dir.resolve()) + assert result["remote_root"] == "s3://bucket/prefix" + assert result["user"] == "alice@example-host" + assert result["created"] == {"db": True, "config": True} + + +def test_dml_init_allows_local_only_bootstrap(tmp_path): + repo_dir = tmp_path / "repo" + repo_dir.mkdir() + + with ( + patch.dict("os.environ", {}, clear=True), + patch("daggerml._internal.dml.Dml.fetch") as mock_fetch, + ): + result = Dml.init(str(repo_dir)) + + project_cfg = DmlProjectConfig.load(repo_dir) + assert project_cfg.remote_project is None + assert project_cfg.remote_root == "" + mock_fetch.assert_not_called() + assert result["created"] == {"db": True, "config": True} + + +def test_dml_init_allows_remote_root_without_remote_project(tmp_path): + repo_dir = tmp_path / "repo" + repo_dir.mkdir() + + with patch("daggerml._internal.dml.Dml.fetch") as 
mock_fetch: + result = Dml.init(str(repo_dir), remote_root="s3://bucket/prefix") + + project_cfg = DmlProjectConfig.load(repo_dir) + assert project_cfg.remote_project is None + assert project_cfg.remote_root == "s3://bucket/prefix" + mock_fetch.assert_not_called() + assert result["remote_root"] == "s3://bucket/prefix" + + +def test_dml_init_rejects_remote_project_without_remote_root(tmp_path): + repo_dir = tmp_path / "repo" + repo_dir.mkdir() + + with patch.dict("os.environ", {}, clear=True): + with pytest.raises(DmlRepoError, match="remote.root is required"): + Dml.init(str(repo_dir), remote_project="dml://alice/demo") + + +def test_dml_init_resolves_remote_root_from_env_for_remote_project(tmp_path, monkeypatch): + repo_dir = tmp_path / "repo" + repo_dir.mkdir() + monkeypatch.setenv("DML_REMOTE_ROOT", "s3://bucket/prefix") + + with patch("daggerml._internal.dml.Dml.fetch", return_value=Ref("commit:9")) as mock_fetch: + result = Dml.init(str(repo_dir), remote_project="dml://alice/demo") + + project_cfg = DmlProjectConfig.load(repo_dir) + assert project_cfg.remote_project == "dml://alice/demo" + assert project_cfg.remote_root == "s3://bucket/prefix" + assert result["remote_root"] == "s3://bucket/prefix" + mock_fetch.assert_called_once_with("origin", None) + + +def test_dml_init_requires_remote_root_for_recovery_pull(tmp_path): + repo_dir = tmp_path / "repo" + dml_dir = repo_dir / ".dml" + dml_dir.mkdir(parents=True) + (dml_dir / "config.toml").write_text('[remote]\nproject = "dml://alice/demo"\n') + + with pytest.raises(DmlRepoError, match="remote.root is required"): + Dml.init(str(repo_dir), remote_root="") + + +def test_dml_init_requires_existing_project_directory(tmp_path): + missing = tmp_path / "missing" + with pytest.raises(FileNotFoundError, match="does not exist"): + Dml.init(str(missing), remote_project="dml://alice/demo", remote_root="s3://bucket/prefix") + + +def test_project_sync_requires_remote_project(tmp_path): + repo_dir = tmp_path / "repo" + 
repo_dir.mkdir() + Dml.init(str(repo_dir), remote_root="s3://bucket/prefix") + dml = Dml(project_home=str(repo_dir), remote_root="s3://bucket/prefix") + + with pytest.raises(DmlRepoError, match="remote.project is required for project sync"): + dml.fetch("origin", None) + + with pytest.raises(DmlRepoError, match="remote.project is required for project sync"): + dml.pull("origin", None, branch="main", user="alice") + + with pytest.raises(DmlRepoError, match="remote.project is required for project sync"): + dml.push(None, branch="main", create=False, force=False) + + +def test_remote_root_only_repo_can_create_indexes(tmp_path): + repo_dir = tmp_path / "repo" + repo_dir.mkdir() + Dml.init(str(repo_dir), remote_root="s3://bucket/prefix") + dml = Dml(project_home=str(repo_dir), remote_root="s3://bucket/prefix") + + with patch("daggerml._internal.exec_state.ExecutionState.update_execution_record", return_value={}): + index_id = dml.runtime.create() + node = dml.runtime.put_literal(index_id, 42, name="answer") + + assert index_id + assert node.ns() == "node-literal" diff --git a/tests/contracts/internal/ops/test_gc_ops_contract.py b/tests/contracts/internal/ops/test_gc_ops_contract.py new file mode 100644 index 0000000..ff8e74d --- /dev/null +++ b/tests/contracts/internal/ops/test_gc_ops_contract.py @@ -0,0 +1,55 @@ +import pytest +from hypothesis import given, settings + +from daggerml._internal.ops.gc import GcOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.types import Commit, Dag, DmlRepoError, LiteralNode, ScalarDatum, Tree +from tests.contracts.internal.support.test_db_support import _gen_ref +from tests.contracts.internal.test_types_contract import _refs + + +class TestGcOps: + def test_gc_and_list_orphans(self, temp_bo): + try: + with temp_bo._tx(readonly=False) as txn: + datum_ref = txn.put(ScalarDatum(data=42)) + node_ref = txn.put(LiteralNode(value=datum_ref)) + dag_ref = txn.put(Dag([node_ref], {}, node_ref)) + # datum_ref = 
_put_datum(temp_bo, 42) + tree = Tree(dags={"main": dag_ref}) + tree_ref = _gen_ref("tree") + commit_ref = _gen_ref("commit") + head_ref = _gen_ref("head") + with temp_bo._tx(readonly=False) as txn: + txn.put(tree, to=tree_ref) + txn.put(Commit(parents=[], tree=tree_ref, author="test", message="test commit"), to=commit_ref) + HeadOps(_db=temp_bo._db).create_branch(head_ref.id(), commit_ref) + ops = GcOps(temp_bo._db) + assert ops.list_orphans() == [] + with temp_bo._tx(readonly=False) as txn: + new_datum_ref = txn.put(ScalarDatum(data="orphan datum")) + assert ops.list_orphans() == [new_datum_ref] + stats = ops.gc() + assert "datum-scalar" in stats and stats["datum-scalar"] == 1 + assert ops.list_orphans() == [] + finally: + temp_bo._db.clear_all() + + @given(_refs("head", full=True)) + @settings(max_examples=1) + def test_gc_error(self, temp_bo, head_ref): + try: + HeadOps(_db=temp_bo._db)._write_pointer_commit( + HeadOps(_db=temp_bo._db)._local_branch_path(head_ref.id()), + _gen_ref("commit"), + ) + ops = GcOps(temp_bo._db) + with pytest.raises(DmlRepoError, match="^GC failed: Failed to list orphans: .*"): + ops.gc() # should raise because head points to non-existent commit + finally: + temp_bo._db.clear_all() + + def test_gc_empty(self, temp_bo): + ops = GcOps(temp_bo._db) + assert ops.list_orphans() == [] + assert ops.gc() == {} diff --git a/tests/contracts/internal/ops/test_git_like_project_ops_contract.py b/tests/contracts/internal/ops/test_git_like_project_ops_contract.py new file mode 100644 index 0000000..81e0bbf --- /dev/null +++ b/tests/contracts/internal/ops/test_git_like_project_ops_contract.py @@ -0,0 +1,158 @@ +from pathlib import Path + +import pytest + +from daggerml._internal.config import DmlProjectConfig, init_project_layout, normalize_project_uri +from daggerml._internal.dml_resolution import resolve_revision_ref +from daggerml._internal.ops.commit import CommitOps +from daggerml._internal.ops.head import HeadOps +from 
daggerml._internal.ops.index import IndexOps +from daggerml._internal.ops.remote import RemoteOps +from daggerml._internal.revision_uri import canonicalize_revision_uri, parse_revision_uri, stringify_revision_uri +from daggerml._internal.types import DmlPointerConflictError, DmlRepoError + + +def test_project_ref_paths_and_dml_uri_validation(remote_ops): + assert remote_ops._project_branch_ref_path("alice", "demo", "feature/x") == ( + "projects/alice/demo/heads/feature/x.json" + ) + assert remote_ops._project_tag_ref_path("alice", "demo", "v1.0") == "projects/alice/demo/tags/v1.0.json" + + +def test_shared_revision_uri_helpers_and_wrappers_are_compatible(): + parsed = parse_revision_uri("dml://alice/demo", default_branch="main") + assert stringify_revision_uri(parsed) == "dml://alice/demo#main" + assert canonicalize_revision_uri("dml://alice/demo", default_branch="main") == "dml://alice/demo#main" + assert normalize_project_uri("dml://alice/demo", default_branch="main", require_branch=True) == "dml://alice/demo#main" + assert normalize_project_uri("dml://alice/demo@v1", require_branch=False) == "dml://alice/demo@v1" + assert RemoteOps.canonical_dml_uri("dml://alice/demo@v1", require_identifier=True) == "dml://alice/demo@v1" + + +def test_project_config_layout_roundtrip(tmp_path: Path): + cfg = DmlProjectConfig( + name="demo", + owner="alice", + remote_root="s3://bucket/team/dml", + ) + db_path = init_project_layout(tmp_path, cfg) + + assert db_path == tmp_path / ".dml" / "db" + assert (tmp_path / ".dml" / ".gitignore").read_text() == "db\nHEAD\nrefs\n" + loaded = DmlProjectConfig.load(tmp_path) + assert loaded.name == "demo" + assert loaded.owner == "alice" + assert loaded.remote_root == "s3://bucket/team/dml" + assert loaded.remote_project == "dml://alice/demo" + + +def test_project_config_layout_allows_missing_remote_project(tmp_path: Path): + cfg = DmlProjectConfig(remote_root="s3://bucket/team/dml") + init_project_layout(tmp_path, cfg) + + loaded = 
DmlProjectConfig.load(tmp_path) + assert loaded.name is None + assert loaded.owner is None + assert loaded.remote_root == "s3://bucket/team/dml" + assert loaded.remote_project is None + + +def test_head_advance_and_revision_resolution(temp_bo_fn): + head_ops = HeadOps(_db=temp_bo_fn._db) + commit_ops = CommitOps(_db=temp_bo_fn._db) + head = head_ops.create_branch("feature") + commit = head_ops.get_branch_commit(head) + new_head = head_ops.create_branch("copy", commit) + + head_ops.update_branch_commit(new_head, head_ops.get_branch_commit(new_head), commit) + assert resolve_revision_ref(value="copy", commit_ops=commit_ops, head_ops=head_ops, project_dir=".") == commit + head_ops.write_attached_head("copy") + with pytest.raises(DmlRepoError, match="walks past the root commit"): + resolve_revision_ref(value="HEAD~1", commit_ops=commit_ops, head_ops=head_ops, project_dir=".") + + +def test_checkout_absent_dag_does_not_advance_head(temp_bo_fn): + head_ops = HeadOps(_db=temp_bo_fn._db) + commit_ops = CommitOps(_db=temp_bo_fn._db) + head = head_ops.create_branch("checkout") + commit = head_ops.get_branch_commit(head) + with pytest.raises(DmlRepoError, match="not found"): + commit_ops.checkout_dag(head, commit, "missing", user="alice") + assert head_ops.get_branch_commit(head) == commit + + +def test_detached_commit_does_not_advance_branch_head_and_reattach_resumes(temp_bo_fn): + head_ops = HeadOps(_db=temp_bo_fn._db) + index_ops = IndexOps(_db=temp_bo_fn._db, remote_root="") + main_head = head_ops.create_branch("main") + start = head_ops.get_branch_commit(main_head) + + detached_index = index_ops.create(head=main_head) + node = index_ops.put_literal(detached_index, 42) + detached_commit = index_ops.commit(detached_index, node, head=None, message="detached") + assert head_ops.get_branch_commit(main_head) == start + + attached_index = index_ops.create(head=main_head) + node2 = index_ops.put_literal(attached_index, 84) + attached_commit = index_ops.commit(attached_index, 
node2, head=main_head, message="attached") + assert head_ops.get_branch_commit(main_head) == attached_commit + assert attached_commit != detached_commit + + +def test_commit_lifecycle_stages_attached_detached_detached_reattach(temp_bo_fn): + head_ops = HeadOps(_db=temp_bo_fn._db) + index_ops = IndexOps(_db=temp_bo_fn._db, remote_root="") + main_head = head_ops.create_branch("main") + + # Stage 1: attached commit advances branch head. + start = head_ops.get_branch_commit(main_head) + idx1 = index_ops.create(head=main_head) + n1 = index_ops.put_literal(idx1, "s1") + c1 = index_ops.commit(idx1, n1, head=main_head, message="stage-1-attached") + assert head_ops.get_branch_commit(main_head) == c1 + + # Stage 2: detached commit from branch snapshot does not advance head. + idx2 = index_ops.create(head=main_head) + n2 = index_ops.put_literal(idx2, "s2") + c2 = index_ops.commit(idx2, n2, head=None, message="stage-2-detached") + assert head_ops.get_branch_commit(main_head) == c1 + + # Stage 3: detached commit from detached commit also does not advance any head. + detached_head = head_ops.create_branch("scratch", c2) + idx3 = index_ops.create(head=detached_head) + n3 = index_ops.put_literal(idx3, "s3") + _c3 = index_ops.commit(idx3, n3, head=None, message="stage-3-detached") + assert head_ops.get_branch_commit(main_head) == c1 + assert head_ops.get_branch_commit(detached_head) == c2 + + # Stage 4: re-attach and commit resumes branch progression. 
+ idx4 = index_ops.create(head=main_head) + n4 = index_ops.put_literal(idx4, "s4") + c4 = index_ops.commit(idx4, n4, head=main_head, message="stage-4-reattach") + assert head_ops.get_branch_commit(main_head) == c4 + assert c4 != start + + +def test_headops_stale_pointer_updates_report_current_commit(temp_bo_fn): + head_ops = HeadOps(_db=temp_bo_fn._db) + index_ops = IndexOps(_db=temp_bo_fn._db, remote_root="") + + branch = head_ops.create_branch("main") + start = head_ops.get_branch_commit(branch) + + idx1 = index_ops.create(head=branch) + node1 = index_ops.put_literal(idx1, "first") + commit1 = index_ops.commit(idx1, node1, head=branch, message="first") + + with pytest.raises(DmlPointerConflictError) as branch_conflict: + head_ops.update_branch_commit(branch, start, commit1) + assert branch_conflict.value.current_commit == commit1 + + idx2 = index_ops.create(head=branch) + stale = head_ops.get_index_commit(idx2) + # write a second node (we don't need the returned ref here) + _ = index_ops.put_literal(idx2, "second") + latest = head_ops.get_index_commit(idx2) + + with pytest.raises(DmlPointerConflictError) as index_conflict: + head_ops.update_index_commit(idx2, stale, latest) + assert index_conflict.value.current_commit == latest diff --git a/tests/contracts/internal/ops/test_index_ops_contract.py b/tests/contracts/internal/ops/test_index_ops_contract.py new file mode 100644 index 0000000..7f1cb22 --- /dev/null +++ b/tests/contracts/internal/ops/test_index_ops_contract.py @@ -0,0 +1,1831 @@ +import json +import os +from contextlib import contextmanager +from pathlib import Path +from types import SimpleNamespace +from typing import Any, cast +from unittest.mock import patch + +import pytest +from hypothesis import HealthCheck, given, settings +from hypothesis import strategies as st + +import daggerml._internal.dml as dml_module +import daggerml.codecs as literal_codec +from daggerml import Dml, new +from daggerml._internal._db import Ref +from 
daggerml._internal.exec_state import CancelledExecutionError +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.index import IndexOps, _PreparedAdapterCall +from daggerml._internal.ops.node import NodeOps +from daggerml._internal.types import ( + ArgvNode, + Commit, + Dag, + DictDatum, + DmlPointerConflictError, + DmlRepoError, + Error, + ImportNode, + KwargvNode, + ListDatum, + LiteralNode, + Runnable, + RunnableDatum, + ScalarDatum, + Tree, + Uri, +) +from tests import temporary_dml +from tests.contracts.internal.support.conftest_support import remote_bucket_and_prefix_from_env +from tests.contracts.internal.support.test_db_support import ( + REF_ALPHABET, + _gen_ref, + scalar_strategy, +) +from tests.contracts.internal.support.util_support import TEST_DIR +from tests.contracts.internal.test_types_contract import _commit_strategy + +_NAME_STRAT = st.text(alphabet=REF_ALPHABET, min_size=1, max_size=12) +DELAYED_FN_URI = str(TEST_DIR / "delayed-sum.py") +PREPOP_FN_URI = str(TEST_DIR / "prepop.py") +ERROR_FN_URI = str(TEST_DIR / "adapter-error.py") +RAND_FN_URI = str(TEST_DIR / "rand.py") +SUM_FN_URI = str(TEST_DIR / "sum.py") +FN_ADAPTER = str(TEST_DIR / "python-fork-adapter.py") + + +def _remote_root_from_env() -> str: + return os.environ["DML_REMOTE_ROOT"] + + +def _remote_protocol_prefix_from_env() -> str: + _bucket, prefix = remote_bucket_and_prefix_from_env() + return f"{prefix}/dml" if prefix else "dml" + + +def _mk_repo_state(temp_bo, *, with_argv: bool = False) -> tuple[IndexOps, str, str]: + """Create a minimal head + working index context for IndexOps tests.""" + branch_name = _gen_ref("head").id() + tree_ref = _gen_ref("tree") + base_commit_ref = _gen_ref("commit") + index_dag_ref = _gen_ref("dag") + index_commit_ref = _gen_ref("commit") + index_id = _gen_ref("index").id() + with temp_bo._tx(readonly=False) as txn: + txn.put(Tree(dags={}), to=tree_ref) + txn.put(Commit(parents=[], tree=tree_ref, author="test", message="base"), 
to=base_commit_ref) + nodes: list[Ref] = [] + argv_node_ref: Ref | None = None + if with_argv: + argv_datum_ref = txn.put(ListDatum(data=[]), to=_gen_ref("datum-list")) + argv_node_ref = txn.put(ArgvNode(value=argv_datum_ref), to=_gen_ref("node", "argv")) + kwargv_datum_ref = txn.put(DictDatum(data={}), to=_gen_ref("datum-dict")) + kwargv_node_ref = txn.put(KwargvNode(value=kwargv_datum_ref), to=_gen_ref("node", "kwargv")) + nodes = [cast(Ref, argv_node_ref), kwargv_node_ref] + txn.put(Dag(nodes=nodes, names={}, result=None, argv=(argv_node_ref if with_argv else None)), to=index_dag_ref) + txn.put( + Commit( + parents=[base_commit_ref], + tree=tree_ref, + author="test", + message="working", + dag=index_dag_ref, + ), + to=index_commit_ref, + ) + head_ops = HeadOps(_db=temp_bo._db) + try: + head_ops.delete_branch(branch_name) + except DmlRepoError: + pass + try: + head_ops.delete_index(index_id) + except DmlRepoError: + pass + head_ops.create_branch(branch_name, base_commit_ref) + head_ops._create_pointer(head_ops._index_path(index_id), index_commit_ref) + return IndexOps(_db=temp_bo._db, remote_root=_remote_root_from_env()), branch_name, index_id + + +def _mk_remote_index_ops(temp_bo) -> IndexOps: + return IndexOps( + _db=temp_bo._db, + remote_root=_remote_root_from_env(), + ) + + +def _unroll_datum(txn, ref: Ref): + datum = txn.get(ref) + if isinstance(datum, ListDatum): + return [_unroll_datum(txn, x) if isinstance(x, Ref) else x for x in datum.data] + if isinstance(datum, DictDatum): + return {k: _unroll_datum(txn, v) if isinstance(v, Ref) else v for k, v in datum.data.items()} + if isinstance(datum, ScalarDatum): + return datum.data + raise AssertionError(f"Unexpected datum type: {type(datum).__name__}") + + +def _put_runnable_literal(ops: IndexOps, index_ref: Ref, *, uri: str, adapter: str) -> Ref: + uri_node = ops.put_literal(index_ref, Uri(uri)) + defaults_node = ops.put_literal(index_ref, {}) + with ops._tx(readonly=True) as txn: + uri_ref = 
txn.get(uri_node).datum_ref(txn) + defaults_ref = txn.get(defaults_node).datum_ref(txn) + return ops.put_literal(index_ref, RunnableDatum(target=uri_ref, sub=None, kwargs=defaults_ref, adapter=adapter)) + + +class _FakeExecutionState: + lock_calls: list[str] = [] + unlock_calls: list[str] = [] + active: dict[str, str] = {} + launch_states: dict[str, dict[str, Any]] = {} + records: dict[str, dict[str, Any]] = {} + execution_edges: list[tuple[str, str]] = [] + + def __init__(self, cache_key: str, *, remote_root: str = ""): + self.cache_key = cache_key + + @classmethod + def reset(cls) -> None: + cls.lock_calls = [] + cls.unlock_calls = [] + cls.active = {} + cls.launch_states = {} + cls.records = {} + cls.execution_edges = [] + + def lock(self): + type(self).lock_calls.append(self.cache_key) + return True + + def unlock(self): + type(self).unlock_calls.append(self.cache_key) + return True + + def read_active_execution_id(self): + return type(self).active.get(self.cache_key) + + def create_active_execution(self, execution_id: str): + if self.cache_key in type(self).active: + return False + type(self).active[self.cache_key] = execution_id + return True + + def delete_active_execution(self): + type(self).active.pop(self.cache_key, None) + + def read_execution_record(self, execution_id: str): + return type(self).records.get(execution_id) + + def read_launch_state(self, execution_id: str): + return type(self).launch_states.get(execution_id) + + def create_launch_state(self, launch_state: dict[str, Any]): + if launch_state["execution_id"] in type(self).launch_states: + return False + type(self).launch_states[launch_state["execution_id"]] = launch_state + return True + + def update_launch_state(self, launch_state: dict[str, Any]): + type(self).launch_states[launch_state["execution_id"]] = launch_state + return launch_state + + def update_execution_record(self, record: dict[str, Any], *, retries: int = 8): + del retries + current = 
type(self).records.get(record["execution_id"]) + if current is None: + type(self).records[record["execution_id"]] = record + return record + rank = {"running": 0, "cancel-pending": 1, "cancel-detached": 2, "succeeded": 3, "failed": 3} + merged = dict(current) + merged["lifecycle"] = ( + record["lifecycle"] if rank[record["lifecycle"]] > rank[current["lifecycle"]] else current["lifecycle"] + ) + merged["spawned_execution_ids"] = sorted( + {*current.get("spawned_execution_ids", []), *record.get("spawned_execution_ids", [])} + ) + merged["updated_at"] = max(current.get("updated_at", 0), record.get("updated_at", 0)) + merged["cancellation_requested_by"] = current.get("cancellation_requested_by") or record.get( + "cancellation_requested_by" + ) + type(self).records[record["execution_id"]] = merged + return merged + + def record_execution_dependency(self, *, caller_execution_id: str, callee_execution_id: str, retries: int = 8): + del retries + type(self).execution_edges.append((caller_execution_id, callee_execution_id)) + + def list_execution_callers(self, callee_execution_id: str): + return [caller for caller, callee in type(self).execution_edges if callee == callee_execution_id] + + def delete_execution_dependency(self, *, caller_execution_id: str, callee_execution_id: str): + type(self).execution_edges = [ + edge for edge in type(self).execution_edges if edge != (caller_execution_id, callee_execution_id) + ] + + def create_cancellation_tombstone(self, *, execution_id: str, cache_key: str, requested_by: str, requested_at: int): + del execution_id, cache_key, requested_by, requested_at + return True + + +@contextmanager +def _opened_db(db=None): + yield db if db is not None else SimpleNamespace() + + +class TestIndexOps: + @given(_commit_strategy()) + @settings(max_examples=10) + def test_list(self, temp_bo, commit_obj): + """List returns existing refs; delete removes them.""" + ops = IndexOps(_db=temp_bo._db, remote_root=_remote_root_from_env()) + head_ops = 
HeadOps(_db=temp_bo._db) + with temp_bo._tx(readonly=False) as txn: + commit_ref = txn.put(commit_obj) + ref_id = head_ops.create_index(commit_ref) + try: + assert ref_id in head_ops.list_indexes() + finally: + ops.delete(ref_id) + assert ref_id not in head_ops.list_indexes() + + @pytest.mark.parametrize( + "builtin,args,expected", + [ + ("list", [1, 2], [1, 2]), + ("dict", ["a", 1, "b", 2], {"a": 1, "b": 2}), + ("get", [{"a": 1}, "a"], 1), + ("get", [{"a": 1}, "b", 9], 9), + ("contains", [{"a": 1, "b": 2}, "a"], True), + ("contains", [{"a": 1, "b": 2}, "c"], False), + ("contains", [[{"a": 1}, {"b": 2}], {"a": 1}], True), + ("contains", [[{"a": 1}, {"b": 2}], {"a": 2}], False), + ("assoc", [{"a": 1, "b": 2}, "c", 3], {"a": 1, "b": 2, "c": 3}), + ("assoc", [{"a": 1, "b": 2}, "a", 9], {"a": 9, "b": 2}), + ("conj", [[1, 2], 3], [1, 2, 3]), + ("unnest", [[[1], [2, 3], [4, [5]]]], [1, 2, 3, 4, [5]]), + ], + ) + def test_start_fn_builtins(self, temp_bo, builtin, args, expected): + ops, _head_ref, index = _mk_repo_state(temp_bo) + try: + before_indexes = set(HeadOps(_db=temp_bo._db).list_indexes()) + fn_node = _put_runnable_literal(ops, index, uri=f"daggerml:{builtin}", adapter="") + arg_nodes = [ops.put_literal(index, arg) for arg in args] + result = ops.start_fn(index, [fn_node, *arg_nodes]) + assert result is not None + nv = NodeOps(_db=temp_bo._db).unroll(result) + assert nv == expected + assert set(HeadOps(_db=temp_bo._db).list_indexes()) == before_indexes + finally: + ops.delete(index) + + def test_failed_execution_helper_does_not_publish_temp_index_ref(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://failed", adapter="dummy-adapter") + arg_node = ops.put_literal(index_ref, 1) + with ops._tx(readonly=False) as txn: + argv_ref = ops._prepare_fn(index_ref, [fn_node, arg_node], {}, txn) + + before_indexes = set(HeadOps(_db=temp_bo._db).list_indexes()) + dag_ref = 
ops._build_failed_execution_dag(argv_ref, "boom") + + with ops._tx(readonly=True) as txn: + dag = txn.get(dag_ref) + assert dag.error is not None + assert set(HeadOps(_db=temp_bo._db).list_indexes()) == before_indexes + finally: + ops.delete(index_ref) + + @pytest.mark.slow + @pytest.mark.parametrize( + "contract_id,stage,args,prepop,name", + [ + ("adapter-path-sum", "unnamed", [1.0, 2.0, 3.0], None, None), + ("adapter-path-sum", "named", [1.0, 2.0, 3.0], None, "result"), + ("adapter-path-prepop", "kwarg-override", [1.0, 2.0, 3.0], 2.0, "result"), + ], + ids=[ + "adapter-path-sum:unnamed", + "adapter-path-sum:named", + "adapter-path-prepop:kwarg-override", + ], + ) + def test_start_fn_adapter_execution_matrix(self, temp_bo, contract_id, stage, args, prepop, name, s3): + del contract_id, stage + temp_bo._db.clear_all() + _ops, _head_ref, index = _mk_repo_state(temp_bo) + ops = _mk_remote_index_ops(temp_bo) + try: + if prepop is None: + fn_node = _put_runnable_literal(ops, index, uri=SUM_FN_URI, adapter=FN_ADAPTER) + expected = float(sum(args)) + kwargv = None + else: + x_default = ops.put_literal(index, 1.0) + fn_node = ops.put_literal( + index, + Runnable(target=Uri(PREPOP_FN_URI), kwargs={"x": x_default}, adapter=FN_ADAPTER), + ) + prepop_node = ops.put_literal(index, prepop) + kwargv = {"x": prepop_node} + expected = float(sum(args) * prepop) + arg_nodes = [ops.put_literal(index, arg) for arg in args] + result = ops.start_fn(index, [fn_node, *arg_nodes], kwargv=kwargv, name=name) + assert result is not None + nv = NodeOps(_db=temp_bo._db).unroll(result) + assert nv == pytest.approx(expected) + finally: + ops.delete(index) + + def test_put_literal_dict_fn(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + n0 = ops.put_literal(index_ref, 0, name="v0") + ops.put_literal(index_ref, {"a": n0}, name="v1") + nops = NodeOps(_db=temp_bo._db) + with ops._tx(readonly=True) as txn: + dag: Dag = 
txn.get_commit_ctx(HeadOps(_db=temp_bo._db).get_index_commit(index_ref)).dag + vals = [nops.unroll(v) for v in dag.nodes] + vals = [str(v) if isinstance(v, dict) else v for v in vals] + vals = [x for x in vals if not isinstance(x, (Uri, Runnable))] + assert {0, "a", "{'a': 0}"} == set(vals) + + def test_put_literal_list_fn(self, temp_bo): + temp_bo._db.clear_all() + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + n0 = ops.put_literal(index_ref, 0, name="v0") + ops.put_literal(index_ref, [1, n0], name="v1") + nops = NodeOps(_db=temp_bo._db) + with ops._tx(readonly=True) as txn: + dag: Dag = txn.get_commit_ctx(HeadOps(_db=temp_bo._db).get_index_commit(index_ref)).dag + vals = [nops.unroll(v) for v in dag.nodes] + vals = [tuple(v) if isinstance(v, list) else v for v in vals] + vals = [x for x in vals if not isinstance(x, (Uri, Runnable))] + assert {0, 1, (1, 0)} == set(vals) + + def test_put_literal_runnable_fn_when_attrs_are_nodes(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + default_value_node = ops.put_literal(index_ref, 10) + runnable_node = ops.put_literal( + index_ref, + Runnable(target=Uri("daggerml:get"), kwargs={"x": default_value_node}, adapter=""), + name="rf", + ) + with ops._tx(readonly=True) as txn: + node = txn.get(runnable_node) + datum = txn.get(node.datum_ref(txn)) + assert isinstance(datum, RunnableDatum) + assert datum.target.ns() == "datum-uri" + assert datum.kwargs.ns() == "datum-dict" + + def test_start_fn_sum_err(self, temp_bo, s3): + _ops, _head_ref, index = _mk_repo_state(temp_bo) + ops = _mk_remote_index_ops(temp_bo) + args = [1, 2, 3, "BOGUS", 5] + try: + fn_node = _put_runnable_literal(ops, index, uri=SUM_FN_URI, adapter=FN_ADAPTER) + node_args = [fn_node, *[ops.put_literal(index, arg) for arg in args]] + with pytest.raises(Error, match="Argument at index 3 is str, expected int or float"): + ops.start_fn(index, node_args) + finally: + ops.delete(index) + + @pytest.mark.slow + @given( + args=st.lists( + 
st.one_of( + st.integers(min_value=-(2**63), max_value=2**63 - 1), + st.floats(allow_nan=False, allow_infinity=False), + ), + max_size=6, + ) + ) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=2, deadline=None) + def test_start_fn_delayed_sum_adapter(self, temp_bo, args, s3): + total = float(sum(args)) + _ops, _head_ref, index = _mk_repo_state(temp_bo) + ops = _mk_remote_index_ops(temp_bo) + try: + with patch.dict(__import__("os").environ, DML_TMP_DIR=ops._db.path): + completion_flag = Path(ops._db.path) / "completion.flag" + if completion_flag.exists(): + completion_flag.unlink() + fn_node = _put_runnable_literal(ops, index, uri=DELAYED_FN_URI, adapter=FN_ADAPTER) + arg_nodes = [ops.put_literal(index, arg) for arg in args] + # First call returns None (job not done yet) + result = ops.start_fn(index, [fn_node, *arg_nodes], name="result") + assert result is None + # Second call returns the result + result = ops.start_fn(index, [fn_node, *arg_nodes], name="result") + assert result is not None + nv = NodeOps(_db=temp_bo._db).unroll(result) + assert nv == pytest.approx(total) + finally: + ops.delete(index) + + @pytest.mark.slow + def test_start_fn_adapter_nonzero_exit_raises(self, temp_bo, tmp_path, s3): + _ops, _head_ref, index = _mk_repo_state(temp_bo) + ops = _mk_remote_index_ops(temp_bo) + failing_adapter = tmp_path / "adapter-fail.py" + failing_adapter.write_text( + ("import sys\nsys.stderr.write('boom from adapter\\n')\nraise SystemExit(1)\n"), + encoding="utf-8", + ) + os.chmod(failing_adapter, 0o755) + try: + fn_node = _put_runnable_literal(ops, index, uri="noop://fn", adapter=str(failing_adapter)) + with pytest.raises(DmlRepoError, match=r"Adapter call failed: .*boom from adapter"): + ops.start_fn(index, [fn_node], name="result") + finally: + ops.delete(index) + + @pytest.mark.slow + def test_start_fn_adapter_invalid_json_raises(self, temp_bo, tmp_path, s3): + _ops, _head_ref, index = _mk_repo_state(temp_bo) + ops = 
_mk_remote_index_ops(temp_bo) + bad_json_adapter = tmp_path / "adapter-bad-json.py" + bad_json_adapter.write_text( + ("print('not-json')\nraise SystemExit(0)\n"), + encoding="utf-8", + ) + os.chmod(bad_json_adapter, 0o755) + try: + fn_node = _put_runnable_literal(ops, index, uri="noop://fn", adapter=str(bad_json_adapter)) + with pytest.raises(DmlRepoError, match="Adapter output must be JSON"): + ops.start_fn(index, [fn_node], name="result") + finally: + ops.delete(index) + + def test_prepare_adapter_call_accepts_concrete_command_without_dml_prefix(self, temp_bo, monkeypatch): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://fn", adapter="podman-adapter") + with ops._tx(readonly=False) as txn: + argv_ref = ops._prepare_fn(index_ref, [fn_node], {}, txn) + + monkeypatch.setattr("daggerml._internal.ops.index.shutil.which", lambda name: "/usr/bin/podman-adapter") + + with ops._tx(readonly=True) as txn: + prepared = ops._prepare_adapter_call(index_ref, argv_ref, txn) + + assert prepared.adapter_path == "/usr/bin/podman-adapter" + assert prepared.runnable["adapter"] == "podman-adapter" + finally: + ops.delete(index_ref) + + def test_prepare_adapter_call_rejects_missing_concrete_adapter_command(self, temp_bo, monkeypatch): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://fn", adapter="missing-adapter") + with ops._tx(readonly=False) as txn: + argv_ref = ops._prepare_fn(index_ref, [fn_node], {}, txn) + + monkeypatch.setattr("daggerml._internal.ops.index.shutil.which", lambda name: None) + + with ops._tx(readonly=True) as txn: + with pytest.raises(DmlRepoError, match="No such adapter: missing-adapter"): + ops._prepare_adapter_call(index_ref, argv_ref, txn) + finally: + ops.delete(index_ref) + + def test_start_fn_rejects_empty_adapter_for_non_builtin_execution(self, temp_bo, monkeypatch): + ops, _head_ref, index_ref = 
_mk_repo_state(temp_bo) + try: + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://fn", adapter="") + monkeypatch.setattr("daggerml._internal.ops.index.shutil.which", lambda name: None) + + with pytest.raises(DmlRepoError, match="Invalid builtin URI scheme: noop"): + ops.start_fn(index_ref, [fn_node]) + finally: + ops.delete(index_ref) + + def test_start_fn_runnable_sub_cycle_raises(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + with ops._tx(readonly=False) as txn: + uri_ref = txn.put(Uri("noop://fn"), to=_gen_ref("datum-uri")) + kwargs_ref = txn.put(DictDatum(data={}), to=_gen_ref("datum-dict")) + runnable_ref = _gen_ref("datum-runnable") + txn.put( + RunnableDatum(target=uri_ref, sub=runnable_ref, kwargs=kwargs_ref, adapter="nonempty"), + to=runnable_ref, + ) + fn_node_ref = ops._put_node(LiteralNode(value=runnable_ref), txn=txn, index_id=index_ref) + with pytest.raises(DmlRepoError, match="Runnable sub cycle detected"): + ops.start_fn(index_ref, [fn_node_ref], name="result") + finally: + ops.delete(index_ref) + + def test_start_fn_caching(self, temp_bo, s3): + # ensure clean DB for this test to prevent map growth from prior tests + temp_bo._db.clear_all() + _ops, _head_ref, index = _mk_repo_state(temp_bo) + ops = IndexOps(_db=temp_bo._db, remote_root=_remote_root_from_env()) + try: + fn_node = _put_runnable_literal(ops, index, uri=RAND_FN_URI, adapter=FN_ADAPTER) + # First call generates a random UUID + result1 = ops.start_fn(index, [fn_node], name="result1") + assert result1 is not None + nv1 = NodeOps(_db=temp_bo._db).unroll(result1) + # Second call with same args should return cached result + result2 = ops.start_fn(index, [fn_node], name="result2") + assert result2 is not None + nv2 = NodeOps(_db=temp_bo._db).unroll(result2) + assert nv1 == nv2 + assert isinstance(nv1, str) + assert len(nv1) == 36 + assert nv1.count("-") == 4 + finally: + ops.delete(index) + + def 
test_start_fn_cache_hit_returns_without_touching_execution_state(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://cached", adapter="dummy-adapter") + hit_dag_ref = Ref(f"dag:{'a' * 64}") + sentinel = object() + _FakeExecutionState.reset() + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: hit_dag_ref) + monkeypatch.setattr(IndexOps, "_finish_fn_result", lambda self, dag_ref, argv, name, txn, index_ref: sentinel) + monkeypatch.setattr(IndexOps, "_call_adapter", lambda *args, **kwargs: pytest.fail("adapter should not run")) + + try: + assert ops.start_fn(index_ref, [fn_node]) is sentinel + assert _FakeExecutionState.lock_calls == [] + finally: + ops.delete(index_ref) + + def test_start_fn_cache_miss_seeds_state_and_calls_adapter_for_running(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://pending", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'b' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-running", + runnable={"target": "noop://pending", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + ) + calls: list[tuple[_PreparedAdapterCall, str]] = [] + _FakeExecutionState.reset() + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: None) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + IndexOps, "_remote_ops", lambda self: SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + 
monkeypatch.setattr( + IndexOps, + "_call_adapter", + lambda self, prepared_arg, argv_ptr, **kwargs: calls.append((prepared_arg, argv_ptr)) + or {"status": "running", "error": None, "state": {"token": kwargs["execution_id"]}}, + ) + monkeypatch.setattr( + IndexOps, "_publish_terminal_state", lambda *args, **kwargs: pytest.fail("publish should not run") + ) + + try: + assert ops.start_fn(index_ref, [fn_node]) is None + assert calls == [(prepared, "argv-ptr")] + assert len(_FakeExecutionState.lock_calls) == 1 + record = next(iter(_FakeExecutionState.records.values())) + assert _FakeExecutionState.active[prepared.cache_key] == record["execution_id"] + assert _FakeExecutionState.launch_states[record["execution_id"]]["resume_state"] == { + "token": record["execution_id"] + } + assert record["spawned_execution_ids"] == [] + finally: + ops.delete(index_ref) + + def test_start_fn_succeeded_state_runs_cleanup_publishes_cache_and_marks_done(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://success", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'c' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-success", + runnable={"target": "noop://success", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + ) + hit_dag_ref = Ref(f"dag:{'d' * 64}") + sentinel = object() + adapter_calls: list[tuple[_PreparedAdapterCall, str]] = [] + cache_hits = iter([None, None, hit_dag_ref]) + _FakeExecutionState.reset() + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: next(cache_hits)) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + IndexOps, "_remote_ops", lambda self: 
SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + monkeypatch.setattr( + IndexOps, + "_call_adapter", + lambda self, prepared_arg, argv_ptr, **kwargs: adapter_calls.append((prepared_arg, argv_ptr)) + or {"status": "succeeded", "error": None, "dag_id": "e" * 64}, + ) + monkeypatch.setattr(IndexOps, "_finish_fn_result", lambda self, dag_ref, argv, name, txn, index_ref: sentinel) + + try: + assert ops.start_fn(index_ref, [fn_node]) is sentinel + assert len(adapter_calls) == 1 + finally: + ops.delete(index_ref) + + def test_start_fn_resume_uses_existing_immutable_state(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://resume", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'9' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-resume", + runnable={"target": "noop://resume", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + ) + _FakeExecutionState.reset() + _FakeExecutionState.active[prepared.cache_key] = f"{prepared.cache_key}-4" + _FakeExecutionState.launch_states[f"{prepared.cache_key}-4"] = { + "execution_id": f"{prepared.cache_key}-4", + "cache_key": prepared.cache_key, + "resume_state": {"token": "original"}, + "created_at": 1, + } + _FakeExecutionState.records[f"{prepared.cache_key}-4"] = { + "cache_key": prepared.cache_key, + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + "execution_id": f"{prepared.cache_key}-4", + } + seen: list[dict[str, Any] | None] = [] + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: None) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + 
IndexOps, "_remote_ops", lambda self: SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + monkeypatch.setattr( + IndexOps, + "_call_adapter", + lambda self, prepared_arg, argv_ptr, **kwargs: seen.append(kwargs["state"]) + or {"status": "running", "error": None, "state": {"token": "replacement"}}, + ) + + try: + assert ops.start_fn(index_ref, [fn_node]) is None + assert seen == [{"token": "original"}] + assert _FakeExecutionState.launch_states[f"{prepared.cache_key}-4"]["resume_state"] == {"token": "original"} + assert _FakeExecutionState.active[prepared.cache_key] == f"{prepared.cache_key}-4" + finally: + ops.delete(index_ref) + + def test_start_fn_records_user_dag_call_edge_on_new_execution(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://lineage", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'7' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-lineage", + runnable={"target": "noop://lineage", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + caller_execution_id=index_ref, + caller_cache_key=index_ref, + ) + _FakeExecutionState.reset() + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: None) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + IndexOps, "_remote_ops", lambda self: SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + monkeypatch.setattr( + IndexOps, + "_call_adapter", + lambda self, prepared_arg, argv_ptr, **kwargs: {"status": "running", "error": None, "state": {}}, + ) + + try: + assert ops.start_fn(index_ref, [fn_node]) is None + created = next(value for key, value in _FakeExecutionState.records.items() if 
key != index_ref) + assert _FakeExecutionState.execution_edges == [(index_ref, created["execution_id"])] + assert _FakeExecutionState.records[index_ref]["spawned_execution_ids"] == [created["execution_id"]] + finally: + ops.delete(index_ref) + + def test_cancel_marks_root_cancel_requested_and_discovers_full_graph(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + _FakeExecutionState.reset() + _FakeExecutionState.records["exec-live"] = { + "execution_id": "exec-live", + "cache_key": "cache-live", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + _FakeExecutionState.launch_states["exec-live"] = { + "execution_id": "exec-live", + "cache_key": "cache-live", + "resume_state": {"token": "live"}, + "created_at": 1, + } + _FakeExecutionState.records[index_ref] = { + "execution_id": index_ref, + "cache_key": index_ref, + "lifecycle": "cancel-pending", + "updated_at": 1, + "spawned_execution_ids": ["exec-live"], + "cancellation_requested_by": None, + } + _FakeExecutionState.execution_edges = [(index_ref, "exec-live")] + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr( + IndexOps, + "_invoke_cancel_update", + lambda self, execution_id, record: {"status": "cancel-detached", "error": None}, + ) + + cancelled_dir = Path(temp_bo._db.path).resolve().parent / "refs" / "local" / "indexes" / ".cancelled" + + result = ops.cancel(index_ref, requested_by="alice@example.com") + + assert result["index_id"] == index_ref + assert result["requested_by"] == "alice@example.com" + assert result["cancelled_path"] == cancelled_dir / index_ref + assert result["graph"] == {(index_ref, "exec-live")} + assert result["candidate_set"] == {"exec-live"} + assert result["own_executions"] == {"exec-live"} + assert _FakeExecutionState.records[index_ref]["lifecycle"] == "cancel-pending" + with 
pytest.raises(DmlRepoError, match="Pointer does not exist"): + HeadOps(_db=temp_bo._db).get_index_commit(index_ref) + assert (cancelled_dir / index_ref).exists() + + def test_runtime_cancel_leaves_cancelled_index_pointer_when_cancellation_fails(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + dml = Dml(project_home="/repo", remote_root=_remote_root_from_env(), user="alice@example.com") + _FakeExecutionState.reset() + _FakeExecutionState.records["exec-live"] = { + "execution_id": "exec-live", + "cache_key": "cache-live", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + _FakeExecutionState.launch_states["exec-live"] = { + "execution_id": "exec-live", + "cache_key": "cache-live", + "resume_state": {"token": "live"}, + "created_at": 1, + } + _FakeExecutionState.records[index_ref] = { + "execution_id": index_ref, + "cache_key": index_ref, + "lifecycle": "cancel-pending", + "updated_at": 1, + "spawned_execution_ids": ["exec-live"], + "cancellation_requested_by": None, + } + _FakeExecutionState.execution_edges = [(index_ref, "exec-live")] + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr( + IndexOps, + "_invoke_cancel_update", + lambda self, execution_id, record: (_ for _ in ()).throw(DmlRepoError("boom")), + ) + cancelled_dir = Path(temp_bo._db.path).resolve().parent / "refs" / "local" / "indexes" / ".cancelled" + + with ( + patch.object(dml_module, "with_db", side_effect=lambda _dml: _opened_db()), + patch.object(dml_module, "make_index_ops", return_value=ops), + pytest.raises(DmlRepoError, match=r"exceeded retry limit for execution exec-live: boom"), + ): + dml.runtime.cancel(index_ref) + + assert (cancelled_dir / index_ref).exists() + assert f".cancelled/{index_ref}" in HeadOps(_db=temp_bo._db).list_indexes() + assert 
_FakeExecutionState.records[index_ref]["lifecycle"] == "cancel-pending" + with pytest.raises(DmlRepoError, match="Pointer does not exist"): + HeadOps(_db=temp_bo._db).get_index_commit(index_ref) + + def test_cancel_retries_existing_cancelled_index(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + _FakeExecutionState.reset() + _FakeExecutionState.records["exec-live"] = { + "execution_id": "exec-live", + "cache_key": "cache-live", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + _FakeExecutionState.records[index_ref] = { + "execution_id": index_ref, + "cache_key": index_ref, + "lifecycle": "cancel-pending", + "updated_at": 1, + "spawned_execution_ids": ["exec-live"], + "cancellation_requested_by": None, + } + _FakeExecutionState.execution_edges = [(index_ref, "exec-live")] + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + cancelled_dir = Path(temp_bo._db.path).resolve().parent / "refs" / "local" / "indexes" / ".cancelled" + + first = ops.cancel(index_ref, requested_by="alice@example.com") + + cancelled_path = cancelled_dir / index_ref + assert cancelled_path == cancelled_dir / index_ref + assert cancelled_path.exists() + assert f".cancelled/{index_ref}" in HeadOps(_db=temp_bo._db).list_indexes() + + result = ops.cancel(index_ref, requested_by="alice@example.com") + + assert first["cancelled_path"] == cancelled_path + assert result["cancelled_path"] == cancelled_path + assert result["candidate_set"] == {"exec-live"} + assert _FakeExecutionState.records[index_ref]["lifecycle"] == "cancel-pending" + assert cancelled_path.exists() + + def test_cancel_candidate_uses_cache_key_lock_and_reports_retry(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + _FakeExecutionState.reset() + _FakeExecutionState.records["exec-live"] 
= { + "execution_id": "exec-live", + "cache_key": "cache-live", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + _FakeExecutionState.records[index_ref] = { + "execution_id": index_ref, + "cache_key": index_ref, + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": ["exec-live"], + "cancellation_requested_by": None, + } + _FakeExecutionState.execution_edges = [(index_ref, "exec-live")] + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr( + _FakeExecutionState, + "lock", + lambda self: type(self).lock_calls.append(self.cache_key) or False, + ) + + result = ops._cancel_execution_candidate( + "exec-live", + requested_by="alice@example.com", + own_executions={"exec-live"}, + ) + + assert result == { + "execution_id": "exec-live", + "outcome": None, + "lock_retry": True, + "cancel_requested": False, + } + assert _FakeExecutionState.lock_calls == ["cache-live"] + + def test_cancel_candidate_invokes_cancel_update_after_unlock(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + _FakeExecutionState.reset() + _FakeExecutionState.records["exec-live"] = { + "execution_id": "exec-live", + "cache_key": "cache-live", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + _FakeExecutionState.launch_states["exec-live"] = { + "execution_id": "exec-live", + "cache_key": "cache-live", + "resume_state": {"token": "live"}, + "created_at": 1, + } + _FakeExecutionState.records[index_ref] = { + "execution_id": index_ref, + "cache_key": index_ref, + "lifecycle": "cancel-pending", + "updated_at": 1, + "spawned_execution_ids": ["exec-live"], + "cancellation_requested_by": None, + } + _FakeExecutionState.execution_edges = [(index_ref, "exec-live")] + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", 
_FakeExecutionState) + + seen = {} + + def _invoke_after_unlock(self, execution_id, record): + del self, record + seen["execution_id"] = execution_id + assert _FakeExecutionState.unlock_calls[-1] == "cache-live" + return {"status": "cancel-detached", "error": None} + + monkeypatch.setattr(IndexOps, "_invoke_cancel_update", _invoke_after_unlock) + + result = ops._cancel_execution_candidate( + "exec-live", + requested_by="alice@example.com", + own_executions={"exec-live"}, + ) + + assert seen["execution_id"] == "exec-live" + assert result["outcome"] == 1 + + def test_start_fn_unlocks_when_cancellation_interrupts_after_adapter(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://pending", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'b' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-running", + runnable={"target": "noop://pending", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + ) + _FakeExecutionState.reset() + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: None) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + IndexOps, "_remote_ops", lambda self: SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + monkeypatch.setattr( + IndexOps, + "_call_adapter", + lambda self, prepared_arg, argv_ptr, **kwargs: {"status": "running", "error": None, "state": {}}, + ) + + original_update = _FakeExecutionState.update_execution_record + + def _interrupt_update(self, record, *, retries=8): + if record["execution_id"] != index_ref: + raise CancelledExecutionError("cancelled") + return original_update(self, record, retries=retries) + + 
monkeypatch.setattr(_FakeExecutionState, "update_execution_record", _interrupt_update) + + with pytest.raises(CancelledExecutionError, match="cancelled"): + ops.start_fn(index_ref, [fn_node]) + assert _FakeExecutionState.unlock_calls == [prepared.cache_key] + + def test_cancel_candidate_uses_global_callers_for_ownership(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + _FakeExecutionState.reset() + _FakeExecutionState.records["exec-shared"] = { + "execution_id": "exec-shared", + "cache_key": "cache-shared", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + _FakeExecutionState.records[index_ref] = { + "execution_id": index_ref, + "cache_key": index_ref, + "lifecycle": "cancel-pending", + "updated_at": 1, + "spawned_execution_ids": ["exec-shared"], + "cancellation_requested_by": "alice@example.com", + } + _FakeExecutionState.records["other-root"] = { + "execution_id": "other-root", + "cache_key": "other-root", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": ["exec-shared"], + "cancellation_requested_by": None, + } + _FakeExecutionState.execution_edges = [(index_ref, "exec-shared"), ("other-root", "exec-shared")] + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + + result = ops._cancel_execution_candidate( + "exec-shared", + requested_by="alice@example.com", + own_executions={"exec-shared"}, + ) + + assert result["outcome"] == -1 + assert result["cancel_requested"] is False + assert _FakeExecutionState.records["exec-shared"]["lifecycle"] == "running" + + def test_cancelled_execution_still_contributes_descendants_but_skips_adapter_work(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + _FakeExecutionState.reset() + _FakeExecutionState.records[index_ref] = { + "execution_id": index_ref, + 
"cache_key": index_ref, + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": ["exec-parent"], + "cancellation_requested_by": None, + } + _FakeExecutionState.records["exec-parent"] = { + "execution_id": "exec-parent", + "cache_key": "cache-parent", + "lifecycle": "cancel-detached", + "updated_at": 1, + "spawned_execution_ids": ["exec-child"], + "cancellation_requested_by": "alice@example.com", + } + _FakeExecutionState.launch_states["exec-parent"] = { + "execution_id": "exec-parent", + "cache_key": "cache-parent", + "resume_state": {"token": "parent"}, + "created_at": 1, + } + _FakeExecutionState.records["exec-child"] = { + "execution_id": "exec-child", + "cache_key": "cache-child", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + _FakeExecutionState.launch_states["exec-child"] = { + "execution_id": "exec-child", + "cache_key": "cache-child", + "resume_state": {"token": "child"}, + "created_at": 1, + } + _FakeExecutionState.execution_edges = [(index_ref, "exec-parent"), ("exec-parent", "exec-child")] + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr( + IndexOps, + "_invoke_cancel_update", + lambda self, execution_id, record: (_ for _ in ()).throw(AssertionError("adapter should not run")), + ) + + plan = ops.cancel(index_ref, requested_by="alice@example.com") + result = ops._cancel_execution_candidate( + "exec-parent", + requested_by="alice@example.com", + own_executions=set(cast(set[str], plan["own_executions"])), + ) + + assert plan["graph"] == {(index_ref, "exec-parent"), ("exec-parent", "exec-child")} + assert result["outcome"] == 1 + assert result["cancel_requested"] is False + + def test_freeze_index_for_cancellation_lists_cancelled_index(self, temp_bo): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + + cancelled_path = ops._freeze_index_for_cancellation(index_ref) + 
+ try: + assert cancelled_path.parent.name == ".cancelled" + assert f".cancelled/{index_ref}" in HeadOps(_db=temp_bo._db).list_indexes() + finally: + ops.delete(f".cancelled/{index_ref}") + + def test_delete_removes_cancelled_index_marker(self, temp_bo): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + + cancelled_path = ops._freeze_index_for_cancellation(index_ref) + + assert cancelled_path.exists() + + ops.delete(f".cancelled/{index_ref}") + + assert not cancelled_path.exists() + + def test_start_fn_discards_stale_active_pointer_before_relaunch(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://stale-active", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'6' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-stale-active", + runnable={"target": "noop://stale-active", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + ) + _FakeExecutionState.reset() + _FakeExecutionState.active[prepared.cache_key] = "stale-execution" + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: None) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + IndexOps, "_remote_ops", lambda self: SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + monkeypatch.setattr( + IndexOps, + "_call_adapter", + lambda self, prepared_arg, argv_ptr, **kwargs: {"status": "running", "error": None, "state": {}}, + ) + + try: + assert ops.start_fn(index_ref, [fn_node]) is None + assert _FakeExecutionState.active[prepared.cache_key] != "stale-execution" + finally: + ops.delete(index_ref) + + def 
test_start_fn_records_fn_dag_call_edge_on_new_execution(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo, with_argv=True) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://lineage-fn", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'8' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-lineage-fn", + runnable={"target": "noop://lineage-fn", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + caller_execution_id="caller-exec-1", + caller_cache_key="caller-cache", + ) + _FakeExecutionState.reset() + _FakeExecutionState.records["caller-exec-1"] = { + "execution_id": "caller-exec-1", + "cache_key": "caller-cache", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: None) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + IndexOps, "_remote_ops", lambda self: SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + monkeypatch.setattr( + IndexOps, + "_call_adapter", + lambda self, prepared_arg, argv_ptr, **kwargs: {"status": "running", "error": None, "state": {}}, + ) + + try: + assert ops.start_fn(index_ref, [fn_node]) is None + created = next(value for key, value in _FakeExecutionState.records.items() if key != "caller-exec-1") + assert _FakeExecutionState.execution_edges == [("caller-exec-1", created["execution_id"])] + assert _FakeExecutionState.records["caller-exec-1"]["spawned_execution_ids"] == [created["execution_id"]] + finally: + ops.delete(index_ref) + + def test_start_fn_failed_state_runs_cleanup_marks_done_and_raises(self, temp_bo, monkeypatch): + ops = 
_mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://failed", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'e' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-failed", + runnable={"target": "noop://failed", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + ) + adapter_calls: list[tuple[_PreparedAdapterCall, str]] = [] + published: list[tuple[Ref, str]] = [] + cache_reads = {"count": 0} + _FakeExecutionState.reset() + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr( + "daggerml._internal.ops.cache.CacheOps._get", + lambda self, argv_ref, txn: None + if (cache_reads.__setitem__("count", cache_reads["count"] + 1) or cache_reads["count"]) <= 2 + else Ref(f"dag:{'f' * 64}"), + ) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + IndexOps, "_remote_ops", lambda self: SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + monkeypatch.setattr( + IndexOps, + "_call_adapter", + lambda self, prepared_arg, argv_ptr, **kwargs: adapter_calls.append((prepared_arg, argv_ptr)) + or {"status": "failed", "error": "boom"}, + ) + monkeypatch.setattr( + IndexOps, + "_publish_terminal_state", + lambda self, argv_ref, state, execution_id: published.append((argv_ref, state["status"], execution_id)), + ) + monkeypatch.setattr( + IndexOps, + "_finish_fn_result", + lambda self, dag_ref, argv, name, txn, index_ref: (_ for _ in ()).throw(DmlRepoError("boom")), + ) + + try: + with pytest.raises(DmlRepoError, match="boom"): + ops.start_fn(index_ref, [fn_node]) + assert len(adapter_calls) == 1 + assert published == [(prepared.argv_ref, "failed", next(iter(_FakeExecutionState.records)))] + finally: + ops.delete(index_ref) + + def 
test_start_fn_done_state_returns_none_without_adapter_call(self, temp_bo, monkeypatch): + ops = _mk_remote_index_ops(temp_bo) + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + fn_node = _put_runnable_literal(ops, index_ref, uri="noop://done", adapter="dummy-adapter") + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'f' * 64}"), + adapter_path="dummy-adapter", + cache_key="cache-key-done", + runnable={"target": "noop://done", "adapter": "dummy-adapter", "kwargs": {}, "sub": None}, + ) + # Lock returns False — another caller holds the lock + _FakeExecutionState.reset() + fake_es = _FakeExecutionState.__new__(_FakeExecutionState) + fake_es.cache_key = prepared.cache_key + original_lock = _FakeExecutionState.lock + + def lock_returns_false(self): + return False + + monkeypatch.setattr(_FakeExecutionState, "lock", lock_returns_false) + monkeypatch.setattr("daggerml._internal.ops.index.ExecutionState", _FakeExecutionState) + monkeypatch.setattr("daggerml._internal.ops.cache.CacheOps._get", lambda self, argv_ref, txn: None) + monkeypatch.setattr(IndexOps, "_prepare_adapter_call", lambda self, index_ref_arg, argv_ref, txn: prepared) + monkeypatch.setattr( + IndexOps, "_remote_ops", lambda self: SimpleNamespace(put_ref_manifest=lambda argv_ref: "argv-ptr") + ) + monkeypatch.setattr(IndexOps, "_call_adapter", lambda *args, **kwargs: pytest.fail("adapter should not run")) + + try: + assert ops.start_fn(index_ref, [fn_node]) is None + finally: + monkeypatch.setattr(_FakeExecutionState, "lock", original_lock) + ops.delete(index_ref) + + def test_start_fn_cache_key_includes_adapter(self, temp_bo, tmp_path, s3): + # This should fail on the current bug: cache key does not include adapter identity. 
+ import sys as _sys + + temp_bo._db.clear_all() + _ops, _head_ref, index = _mk_repo_state(temp_bo) + ops = _mk_remote_index_ops(temp_bo) + try: + alt_adapter = tmp_path / "python-fork-adapter-alt.sh" + alt_adapter.write_text( + f'#!/usr/bin/env bash\nset -eu\nexec "{_sys.executable}" "{FN_ADAPTER}"\n', + encoding="utf-8", + ) + os.chmod(alt_adapter, 0o755) + + fn_node_path_adapter = _put_runnable_literal(ops, index, uri=RAND_FN_URI, adapter=str(alt_adapter)) + fn_node_default_adapter = _put_runnable_literal(ops, index, uri=RAND_FN_URI, adapter=FN_ADAPTER) + + result1 = ops.start_fn(index, [fn_node_path_adapter], name="result_path_adapter") + result2 = ops.start_fn(index, [fn_node_default_adapter], name="result_default_adapter") + assert result1 is not None + assert result2 is not None + nv1 = NodeOps(_db=temp_bo._db).unroll(result1) + nv2 = NodeOps(_db=temp_bo._db).unroll(result2) + + assert nv1 != nv2 + assert isinstance(nv1, str) + assert isinstance(nv2, str) + assert len(nv1) == 36 + assert nv1.count("-") == 4 + assert len(nv2) == 36 + assert nv2.count("-") == 4 + finally: + ops.delete(index) + + @given(value=scalar_strategy(), name=_NAME_STRAT) + @settings(max_examples=10) + def test_put_literal(self, temp_bo, value, name): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + node_ref = ops.put_literal(index_ref, value, name=name) + with ops._tx(readonly=True) as txn: + ctx = txn.get_commit_ctx(HeadOps(_db=temp_bo._db).get_index_commit(index_ref)) + assert node_ref in ctx.dag.nodes + assert ctx.dag.names[name] == node_ref + node = txn.get(node_ref) + assert isinstance(node, LiteralNode) + assert _unroll_datum(txn, node.value) == value + finally: + ops.delete(index_ref) + + def test_literal_codecs_traverse_codec_returned_collection(self, temp_bo): + class SeedCodec: + def can_encode(self, value): + return value == "seed" + + def encode(self, value, dag): + return [1, 2] + + class IntCodec: + def can_encode(self, value): + return isinstance(value, int) 
and value < 10 + + def encode(self, value, dag): + return value + 10 + + old_codecs = literal_codec._literal_codecs.copy() + old_seq = literal_codec._literal_codec_seq + old_plugins_loaded = literal_codec._plugins_loaded + try: + literal_codec._literal_codecs = [] + literal_codec._literal_codec_seq = 0 + literal_codec._plugins_loaded = True + literal_codec.register_codec(SeedCodec(), priority=10) + literal_codec.register_codec(IntCodec(), priority=0) + with temporary_dml() as dml: + dag = new(dml=dml, name="codec-collection", message="codec-collection") + assert dag.put("seed").value() == [11, 12] + finally: + literal_codec._literal_codecs = old_codecs + literal_codec._literal_codec_seq = old_seq + literal_codec._plugins_loaded = old_plugins_loaded + + def test_literal_codecs_traverse_codec_returned_runnable(self, temp_bo): + class SeedCodec: + def can_encode(self, value): + return value == "seed" + + def encode(self, value, dag): + return Runnable(target=Uri("daggerml:list"), adapter="", kwargs={"x": 1}, sub=None) + + class IntCodec: + def can_encode(self, value): + return isinstance(value, int) and value == 1 + + def encode(self, value, dag): + return value + 1 + + old_codecs = literal_codec._literal_codecs.copy() + old_seq = literal_codec._literal_codec_seq + old_plugins_loaded = literal_codec._plugins_loaded + try: + literal_codec._literal_codecs = [] + literal_codec._literal_codec_seq = 0 + literal_codec._plugins_loaded = True + literal_codec.register_codec(SeedCodec(), priority=10) + literal_codec.register_codec(IntCodec(), priority=0) + with temporary_dml() as dml: + dag = new(dml=dml, name="codec-runnable", message="codec-runnable") + encoded = dag.put("seed").value() + assert isinstance(encoded, Runnable) + assert encoded.target.uri == "daggerml:list" + assert encoded.kwargs["x"] == 2 + finally: + literal_codec._literal_codecs = old_codecs + literal_codec._literal_codec_seq = old_seq + literal_codec._plugins_loaded = old_plugins_loaded + + def 
test_start_fn_applies_codec_to_argv_and_kwargv(self, temp_bo): + with temporary_dml() as dml: + dag0 = new(dml=dml, name="codec-call", message="codec-call") + fn = Runnable(target=Uri("daggerml:list"), adapter="", kwargs={"x": 0}, sub=None) + result_ref = dag0.put(42).ref + seen: list[Any] = [] + + def _spy_apply(value, *, dag): + assert dag is dag0 + seen.append(value) + return value + + original_start_fn = dag0._start_fn + dag0._start_fn = lambda argv, *, kwargv=None, name=None: result_ref + try: + with patch("daggerml.codecs.apply_codec", side_effect=_spy_apply): + out = dag0.call(fn, 1, x=2) + finally: + dag0._start_fn = original_start_fn + assert out.value() == 42 + assert fn in seen + assert 1 in seen + assert 2 in seen + + def test_get_argv_raises_when_missing(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo, with_argv=False) + try: + with pytest.raises(DmlRepoError, match="DAG has no argv node"): + ops.get_argv(index_ref) + finally: + ops.delete(index_ref) + + def test_get_argv_returns_node_when_present(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo, with_argv=True) + try: + argv_node_ref = ops.get_argv(index_ref) + with ops._tx(readonly=True) as txn: + node = txn.get(argv_node_ref) + assert isinstance(node, ArgvNode) + finally: + ops.delete(index_ref) + + def test_get_kwargv_returns_node_when_present(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo, with_argv=True) + try: + kwargv_node_ref = ops.get_kwargv(index_ref) + with ops._tx(readonly=True) as txn: + node = txn.get(kwargv_node_ref) + assert isinstance(node, KwargvNode) + finally: + ops.delete(index_ref) + + @pytest.mark.slow + def test_start_fn_sub_runnable_forwards_and_resolves_kwargs(self, temp_bo, tmp_path, s3): + temp_bo._db.clear_all() + _ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + ops = _mk_remote_index_ops(temp_bo) + outer_log = tmp_path / "outer-runnable.json" + inner_log = tmp_path / "inner-runnable.json" + inner_adapter 
= tmp_path / "inner-adapter.py" + outer_adapter = tmp_path / "outer-adapter.py" + + inner_adapter.write_text( + ( + "import json\n" + "import subprocess\n" + "import sys\n" + "from urllib.parse import urlparse\n" + f"LOG_PATH = {str(inner_log)!r}\n" + "raw = sys.stdin.read()\n" + "payload = json.loads(raw)\n" + "with open(LOG_PATH, 'w', encoding='utf-8') as fh:\n" + " json.dump(payload.get('runnable', {}), fh, sort_keys=True)\n" + "target = payload.get('runnable', {}).get('target', '')\n" + "script = urlparse(target).path\n" + "completed = subprocess.run(\n" + " [sys.executable, script],\n" + " input=raw,\n" + " text=True,\n" + " capture_output=True,\n" + " check=False,\n" + ")\n" + "sys.stdout.write(completed.stdout)\n" + "sys.stderr.write(completed.stderr)\n" + "raise SystemExit(completed.returncode)\n" + ), + encoding="utf-8", + ) + outer_adapter.write_text( + ( + "import json\n" + "import shutil\n" + "import subprocess\n" + "import sys\n" + f"LOG_PATH = {str(outer_log)!r}\n" + "raw = sys.stdin.read()\n" + "payload = json.loads(raw)\n" + "runnable = payload.get('runnable', {})\n" + "with open(LOG_PATH, 'w', encoding='utf-8') as fh:\n" + " json.dump(runnable, fh, sort_keys=True)\n" + "sub = runnable.get('sub')\n" + "if sub is None:\n" + " sys.stderr.write('missing sub runnable\\n')\n" + " raise SystemExit(1)\n" + "adapter = sub.get('adapter', '')\n" + "adapter_path = shutil.which(adapter) if '/' not in adapter else adapter\n" + "if adapter_path is None:\n" + " sys.stderr.write(f'No such adapter: {adapter}\\n')\n" + " raise SystemExit(1)\n" + "cmd = [adapter_path]\n" + "if adapter_path.endswith('.py'):\n" + " cmd = [sys.executable, adapter_path]\n" + "forwarded = {\n" + " 'argv_ptr': payload.get('argv_ptr'),\n" + " 'cache_key': payload.get('cache_key'),\n" + " 'execution_id': payload.get('execution_id'),\n" + " 'remote': payload.get('remote'),\n" + " 'runnable': sub,\n" + " 'state': payload.get('state'),\n" + " 'execution_status': 
payload.get('execution_status'),\n" + " 'cancel_requested_by': payload.get('cancel_requested_by'),\n" + "}\n" + "completed = subprocess.run(\n" + " cmd,\n" + " input=json.dumps(forwarded),\n" + " text=True,\n" + " capture_output=True,\n" + " check=False,\n" + ")\n" + "sys.stdout.write(completed.stdout)\n" + "sys.stderr.write(completed.stderr)\n" + "raise SystemExit(completed.returncode)\n" + ), + encoding="utf-8", + ) + os.chmod(inner_adapter, 0o755) + os.chmod(outer_adapter, 0o755) + + try: + fn_node = ops.put_literal( + index_ref, + Runnable( + target=Uri("wrapper://outer"), + kwargs={"y": 1.0, "shared": 40.0}, + adapter=str(outer_adapter), + sub=Runnable( + target=Uri(PREPOP_FN_URI), + kwargs={"x": 2.0, "shared": 20.0}, + adapter=str(inner_adapter), + ), + ), + ) + arg_nodes = [ops.put_literal(index_ref, x) for x in [1.0, 2.0, 3.0]] + result_ref = ops.start_fn( + index_ref, + [fn_node, *arg_nodes], + kwargv={ + "x": ops.put_literal(index_ref, 10.0), + "y": ops.put_literal(index_ref, 30.0), + "shared": ops.put_literal(index_ref, 99.0), + }, + name="result", + ) + assert result_ref is not None + assert NodeOps(_db=temp_bo._db).unroll(result_ref) == pytest.approx(60.0) + + outer_runnable = json.loads(outer_log.read_text(encoding="utf-8")) + inner_runnable = json.loads(inner_log.read_text(encoding="utf-8")) + + assert outer_runnable["target"] == "wrapper://outer" + assert outer_runnable["kwargs"]["y"] == 30.0 + assert outer_runnable["kwargs"]["shared"] == 40.0 + assert outer_runnable["sub"]["target"] == PREPOP_FN_URI + assert outer_runnable["sub"]["kwargs"]["x"] == 10.0 + assert outer_runnable["sub"]["kwargs"]["shared"] == 99.0 + + assert inner_runnable["target"] == PREPOP_FN_URI + assert inner_runnable["kwargs"]["x"] == 10.0 + assert inner_runnable["kwargs"]["shared"] == 99.0 + finally: + ops.delete(index_ref) + + def test_call_adapter_envelope_includes_execution_status(self, temp_bo, monkeypatch): + ops = IndexOps(_db=temp_bo._db, 
remote_root=_remote_root_from_env()) + prepared = _PreparedAdapterCall( + argv_ref=Ref(f"node-argv:{'1' * 64}"), + adapter_path="capture-adapter", + cache_key="f" * 64, + runnable={"target": "wrapper://capture", "adapter": "capture-adapter", "kwargs": {}, "sub": None}, + ) + seen: dict[str, object] = {} + + def _fake_run(cmd, *, input, capture_output, text): + seen["cmd"] = cmd + seen["payload"] = json.loads(input) + return SimpleNamespace(returncode=0, stdout='{"status":"running","error":null,"state":{}}', stderr="") + + monkeypatch.setattr("daggerml._internal.ops.index.run", _fake_run) + + result = ops._call_adapter( + prepared, + "a" * 64, + execution_id="exec-1", + state=None, + execution_status="cancel-pending", + cancel_requested_by="alice@example.com", + ) + assert result == {"status": "running", "error": None, "state": {}} + payload = seen["payload"] + assert isinstance(payload, dict) + assert payload["cache_key"] == prepared.cache_key + assert payload["argv_ptr"] == "a" * 64 + assert payload["execution_id"] == "exec-1" + assert payload["state"] is None + assert payload["execution_status"] == "cancel-pending" + assert payload["cancel_requested_by"] == "alice@example.com" + assert payload["remote"] == {"root": _remote_root_from_env()} + assert "comms" not in payload + + def test_get_node_returns_named_node(self, temp_bo): + """get_node returns the ref of a node that was stored with a name.""" + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + node_ref = ops.put_literal(index_ref, 42, name="my_node") + retrieved_ref = ops.get_node(index_ref, "my_node") + assert retrieved_ref == node_ref + finally: + ops.delete(index_ref) + + def test_describe_returns_current_index_state(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + node_ref = ops.put_literal(index_ref, 42, name="answer") + info = ops.describe(index_ref) + assert info["dag"].ns() == "dag" + assert node_ref in info["nodes"] + assert info["names"]["answer"] == 
node_ref + finally: + ops.delete(index_ref) + + def test_set_node_name_updates_name_map(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + node_ref = ops.put_literal(index_ref, 42) + ops.set_node_name(index_ref, "answer", node_ref) + assert ops.get_node(index_ref, "answer") == node_ref + finally: + ops.delete(index_ref) + + def test_put_literal_retries_from_pointer_conflict_current_commit(self, temp_bo, monkeypatch): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + original = HeadOps.update_index_commit + calls = {"count": 0} + + def _conflict_once(self, idx, old_commit, new_commit, txn=None): + del txn + calls["count"] += 1 + if calls["count"] == 1: + current_commit = original(self, idx, old_commit, new_commit) + raise DmlPointerConflictError("stale", current_commit=current_commit) + return original(self, idx, old_commit, new_commit) + + monkeypatch.setattr(HeadOps, "update_index_commit", _conflict_once) + + try: + node_ref = ops.put_literal(index_ref, 42, name="answer") + assert calls["count"] >= 2 + assert ops.get_node(index_ref, "answer") == node_ref + finally: + ops.delete(index_ref) + + def test_commit_retries_branch_publication_from_pointer_conflict_current_commit(self, temp_bo, monkeypatch): + ops, head_ref, index_ref = _mk_repo_state(temp_bo) + original = HeadOps.update_branch_commit + calls = {"count": 0} + + def _conflict_once(self, branch_name, old_commit, new_commit, txn=None): + del txn + calls["count"] += 1 + if calls["count"] == 1: + current_commit = original(self, branch_name, old_commit, new_commit) + raise DmlPointerConflictError("stale", current_commit=current_commit) + return original(self, branch_name, old_commit, new_commit) + + monkeypatch.setattr(HeadOps, "update_branch_commit", _conflict_once) + + try: + node_ref = ops.put_literal(index_ref, 42, name="answer") + commit_ref = ops.commit(index_ref, node_ref, head=head_ref, message="done") + assert calls["count"] >= 2 + assert 
HeadOps(_db=temp_bo._db).get_branch_commit(head_ref) == commit_ref + finally: + if index_ref in HeadOps(_db=temp_bo._db).list_indexes(): + ops.delete(index_ref) + + def test_get_node_raises_when_name_not_found(self, temp_bo): + """get_node raises DmlRepoError when the name doesn't exist in the DAG.""" + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + with pytest.raises(DmlRepoError, match="Node 'nonexistent' not found in DAG"): + ops.get_node(index_ref, "nonexistent") + finally: + ops.delete(index_ref) + + @given(value=scalar_strategy(), name=_NAME_STRAT) + @settings(max_examples=10) + def test_get_node_roundtrip(self, temp_bo, value, name): + """get_node returns the same ref that was stored via put_literal with a name.""" + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + node_ref = ops.put_literal(index_ref, value, name=name) + retrieved_ref = ops.get_node(index_ref, name) + assert retrieved_ref == node_ref + # Verify the node contains the expected value + with ops._tx(readonly=True) as txn: + node = txn.get(retrieved_ref) + assert isinstance(node, LiteralNode) + assert _unroll_datum(txn, node.value) == value + finally: + ops.delete(index_ref) + + def test_get_node_with_multiple_named_nodes(self, temp_bo): + """get_node correctly retrieves each named node when multiple exist.""" + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + ref_a = ops.put_literal(index_ref, "value_a", name="node_a") + ref_b = ops.put_literal(index_ref, "value_b", name="node_b") + ref_c = ops.put_literal(index_ref, "value_c", name="node_c") + + assert ops.get_node(index_ref, "node_a") == ref_a + assert ops.get_node(index_ref, "node_b") == ref_b + assert ops.get_node(index_ref, "node_c") == ref_c + finally: + ops.delete(index_ref) + + def test_put_import_incomplete_dag_errors(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + with ops._tx(readonly=True) as txn: + ctx = 
txn.get_commit_ctx(HeadOps(_db=temp_bo._db).get_index_commit(index_ref)) + with pytest.raises(DmlRepoError, match="Cannot import from a DAG with no result node"): + ops.put_import(index_ref, cast(Ref, ctx.commit.dag)) + finally: + ops.delete(index_ref) + + def test_put_import_imports_other_dag_result(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + try: + other_dag_ref = _gen_ref("dag") + other_node_ref = _gen_ref("node", "literal") + other_datum_ref = _gen_ref("datum-scalar") + with ops._tx(readonly=False) as txn: + txn.put(ScalarDatum(data=123), to=other_datum_ref) + txn.put(LiteralNode(value=other_datum_ref), to=other_node_ref) + txn.put(Dag(nodes=[other_node_ref], names={}, result=other_node_ref, argv=None), to=other_dag_ref) + + imported_ref = ops.put_import(index_ref, other_dag_ref, name="imported") + with ops._tx(readonly=True) as txn: + node = txn.get(imported_ref) + assert isinstance(node, ImportNode) + assert node.dag == other_dag_ref + assert node.node == other_node_ref + ctx = txn.get_commit_ctx(HeadOps(_db=temp_bo._db).get_index_commit(index_ref)) + assert imported_ref in ctx.dag.nodes + finally: + ops.delete(index_ref) + + def test_start_fn_requires_runnable_first_arg(self, temp_bo): + ops, _head_ref, index_ref = _mk_repo_state(temp_bo) + non_runnable = ops.put_literal(index_ref, 123) + with pytest.raises(DmlRepoError, match="First arg must resolve to a Runnable datum"): + ops.start_fn(index_ref, [non_runnable]) + + def test_create_argv_ptr_rejects_non_argv_root(self, temp_bo, s3): + from daggerml._internal.ops.remote import RemoteOps + + ops, head_ref, index_ref = _mk_repo_state(temp_bo) + remote_index_ops = IndexOps(_db=temp_bo._db, remote_root=_remote_root_from_env()) + try: + literal_node = ops.put_literal(index_ref, 123) + bucket, _prefix = remote_bucket_and_prefix_from_env() + prefix = _remote_protocol_prefix_from_env() + remote_ops = RemoteOps(_db=temp_bo._db, client=s3, bucket=bucket, prefix=prefix) + bad_ptr = 
remote_ops.put_ref_manifest(literal_node) + with pytest.raises(DmlRepoError, match="Manifest root namespace mismatch"): + remote_index_ops.create(argv_ptr=bad_ptr) + finally: + ops.delete(index_ref) + HeadOps(_db=temp_bo._db).delete_branch(head_ref) + + def test_create_validates_input_mode(self, temp_bo): + ops, head_ref, index_ref = _mk_repo_state(temp_bo) + try: + with pytest.raises(DmlRepoError, match="Provide exactly one of branch, commit, or argv_ptr."): + ops.create() + with pytest.raises(DmlRepoError, match="Provide exactly one of branch, commit, or argv_ptr."): + ops.create(head=head_ref, argv_ptr="a" * 64) + finally: + ops.delete(index_ref) + HeadOps(_db=temp_bo._db).delete_branch(head_ref) + + def test_create_argv_ptr_loads_remote_argv(self, temp_bo, s3): + from daggerml._internal.ops.remote import RemoteOps + + ops, head_ref, index_ref = _mk_repo_state(temp_bo) + created_index = None + try: + fn_node = _put_runnable_literal(ops, index_ref, uri="daggerml:list", adapter="") + arg_node = ops.put_literal(index_ref, 42) + with ops._tx(readonly=False) as txn: + argv_ref = ops._prepare_fn(index_ref, [fn_node, arg_node], {}, txn) + + bucket, _prefix = remote_bucket_and_prefix_from_env() + prefix = _remote_protocol_prefix_from_env() + remote_ops = RemoteOps(_db=temp_bo._db, client=s3, bucket=bucket, prefix=prefix) + argv_ptr = remote_ops.put_ref_manifest(argv_ref) + + remote_index_ops = IndexOps(_db=temp_bo._db, remote_root=_remote_root_from_env()) + created_index = remote_index_ops.create(argv_ptr=argv_ptr) + + with remote_index_ops._tx(readonly=True) as txn: + ctx = txn.get_commit_ctx(HeadOps(_db=temp_bo._db).get_index_commit(created_index)) + assert ctx.dag is not None + assert ctx.dag.argv == argv_ref + kwargv_ref = remote_index_ops._kwargv_ref_from_nodes(ctx.dag, txn) + assert kwargv_ref is not None + assert kwargv_ref.ns() == "node-kwargv" + finally: + if created_index is not None: + ops.delete(created_index) + ops.delete(index_ref) + 
HeadOps(_db=temp_bo._db).delete_branch(head_ref) + + @given(value=scalar_strategy(), dag_name=_NAME_STRAT) + @settings(max_examples=10) + def test_commit_deletes_index_and_updates_head(self, temp_bo, value, dag_name): + ops, head_ref, index_ref = _mk_repo_state(temp_bo) + with ops._tx(readonly=True) as txn: + before = HeadOps(_db=temp_bo._db).get_branch_commit(head_ref) + node_ref = ops.put_literal(index_ref, value, name="result") + commit_ref = ops.commit(index_ref, node_ref, message="done", dag_name=dag_name, head=head_ref) + + with ops._tx(readonly=True) as txn: + assert index_ref not in HeadOps(_db=temp_bo._db).list_indexes() + assert HeadOps(_db=temp_bo._db).get_branch_commit(head_ref) == commit_ref + assert HeadOps(_db=temp_bo._db).get_branch_commit(head_ref) != before + + commit_obj = txn.get(commit_ref) + assert isinstance(commit_obj, Commit) + assert commit_obj.message == "done" + tree_obj = txn.get(commit_obj.tree) + assert isinstance(tree_obj, Tree) + assert dag_name in tree_obj.dags diff --git a/tests/contracts/internal/ops/test_node_ops_contract.py b/tests/contracts/internal/ops/test_node_ops_contract.py new file mode 100644 index 0000000..cfe2db9 --- /dev/null +++ b/tests/contracts/internal/ops/test_node_ops_contract.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +import pytest +from hypothesis import given, settings +from hypothesis import strategies as st + +from daggerml._internal._db import Ref +from daggerml._internal.ops.node import NodeOps +from daggerml._internal.types import ( + Dag, + DictDatum, + DmlRepoError, + FnNode, + ImportNode, + ListDatum, + LiteralNode, + ScalarDatum, +) +from tests.contracts.internal.support.test_db_support import _gen_ref, scalar_strategy + + +def _raw_object_strategy(): + return st.recursive( + scalar_strategy(), + lambda children: st.one_of( + st.lists(children, max_size=4), + st.dictionaries(st.text(max_size=16), children, max_size=4), + ), + max_leaves=16, + ) + + +def _contains_ref(x) -> bool: + 
if isinstance(x, Ref): + return True + if isinstance(x, list): + return any(_contains_ref(v) for v in x) + if isinstance(x, dict): + return any(_contains_ref(v) for v in x.values()) + return False + + +def _put_datum_tree(ops, value) -> Ref: + with ops._tx(readonly=False) as txn: + + def put_internal(v): + if isinstance(v, list): + return txn.put(ListDatum(data=[put_internal(x) for x in v])) + if isinstance(v, dict): + return txn.put(DictDatum(data={k: put_internal(vv) for k, vv in v.items()})) + return txn.put(ScalarDatum(data=v)) + + return put_internal(value) + + +class TestNodeOps: + @given(obj=_raw_object_strategy()) + @settings(max_examples=25) + def test_unroll_roundtrip(self, temp_bo, obj): + root_datum = _put_datum_tree(temp_bo, obj) + with temp_bo._tx(readonly=False) as txn: + node_ref = txn.put(LiteralNode(value=root_datum)) + ops = NodeOps(_db=temp_bo._db) + assert ops.unroll(node_ref) == obj + assert _contains_ref(ops.unroll(node_ref)) is False + + @given( + obj=st.one_of( + st.lists(scalar_strategy(), max_size=4), + st.dictionaries(st.text(max_size=16), scalar_strategy(), max_size=4), + ) + ) + @settings(max_examples=25) + def test_get_is_one_layer_deep(self, temp_bo, obj): + root_datum = _put_datum_tree(temp_bo, obj) + with temp_bo._tx(readonly=False) as txn: + node_ref = txn.put(LiteralNode(value=root_datum)) + ops = NodeOps(_db=temp_bo._db) + got = ops.get(node_ref) + assert isinstance(got, type(obj)) + if isinstance(got, list): + assert all(isinstance(x, Ref) and x.nss()[0] == "datum" for x in got) + else: + assert all(isinstance(v, Ref) and v.nss()[0] == "datum" for v in got.values()) + + def test_import_node_unroll(self, temp_bo): + with temp_bo._tx(readonly=False) as txn: + inner_datum = txn.put(ScalarDatum(data=123)) + inner_node = txn.put(LiteralNode(value=inner_datum)) + import_node_ref = txn.put(ImportNode(dag=_gen_ref("dag"), node=inner_node)) + ops = NodeOps(_db=temp_bo._db) + assert ops.get(import_node_ref) == 123 + assert 
ops.unroll(import_node_ref) == 123 + info = ops.describe(import_node_ref) + assert info["type"] == "ImportNode" + assert info["dag"].ns() == "dag" + + def test_describe_fn_node(self, temp_bo): + with temp_bo._tx(readonly=False) as txn: + datum_ref = txn.put(ScalarDatum(data=1)) + lit_ref = txn.put(LiteralNode(value=datum_ref)) + dag_ref = txn.put(Dag(nodes=[lit_ref], names={}, result=lit_ref)) + fn_ref = txn.put(FnNode(argv=[lit_ref], dag=dag_ref)) + ops = NodeOps(_db=temp_bo._db) + info = ops.describe(fn_ref) + assert info["type"] == "FnNode" + assert info["dag"] == dag_ref + assert info["argv"] == [lit_ref] + + def test_requires_node_ref(self, temp_bo): + ops = NodeOps(_db=temp_bo._db) + with pytest.raises(DmlRepoError, match="Expected node ref"): + ops.get(_gen_ref("datum-scalar")) diff --git a/tests/contracts/internal/ops/test_revision_parsing_contract_matrix.py b/tests/contracts/internal/ops/test_revision_parsing_contract_matrix.py new file mode 100644 index 0000000..98e64f4 --- /dev/null +++ b/tests/contracts/internal/ops/test_revision_parsing_contract_matrix.py @@ -0,0 +1,309 @@ +from pathlib import Path + +import pytest + +from daggerml._internal._db import Ref +from daggerml._internal.config import DmlProjectConfig, init_project_layout +from daggerml._internal.dml_resolution import resolve_dag_ref, resolve_node_ref, resolve_revision +from daggerml._internal.ops.commit import CommitOps +from daggerml._internal.ops.dag import DagOps +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.index import IndexOps +from daggerml._internal.ops.remote import RemoteOps +from daggerml._internal.types import Commit, DmlRepoError, Tree + + +@pytest.mark.parametrize( + "contract_id,label,uri,expected_path", + [ + ( + "revision-uri-canonicalization", + "branch-uri", + "dml://alice/demo#main", + "projects/alice/demo/heads/main.json", + ), + ( + "revision-uri-canonicalization", + "tag-uri", + "dml://alice/demo@v1.0", + 
"projects/alice/demo/tags/v1.0.json", + ), + ], + ids=[ + "revision-uri-canonicalization:branch-uri", + "revision-uri-canonicalization:tag-uri", + ], +) +def test_uri_canonicalization_matrix(remote_ops, contract_id, label, uri, expected_path): + del contract_id, label + assert RemoteOps.canonical_dml_uri(uri, require_identifier=True) == uri + assert remote_ops._dml_uri_ref_path(uri) == expected_path + + +def test_uri_canonicalization_rejects_oversized_identifier(remote_ops): + del remote_ops + with pytest.raises(ValueError, match="64-byte"): + RemoteOps.canonical_dml_uri("dml://alice/" + "x" * 80 + "#main", require_identifier=True) + + +def _seed_project_commit_history(temp_bo_fn, tmp_path: Path) -> tuple[CommitOps, Ref, Ref]: + project = DmlProjectConfig(name="demo", owner="alice", remote_root="s3://bucket/prefix") + init_project_layout(tmp_path, project) + head_ops = HeadOps(_db=temp_bo_fn._db) + commit_ops = CommitOps(_db=temp_bo_fn._db) + + main_head = head_ops.create_branch("main") + head_ops.write_attached_head("main") + initial = head_ops.get_branch_commit(main_head) + with commit_ops._tx(readonly=False) as txn: + tree = txn.get(txn.get(initial).tree) + next_tree = txn.put(Tree(dags=dict(tree.dags))) + next_commit = txn.put(Commit(parents=[initial], tree=next_tree, author="alice", message="next")) + head_ops.update_branch_commit(main_head, initial, next_commit) + head_ops.create_branch("dml://alice/demo@v1_0", initial) + + return commit_ops, initial, next_commit + + +def _seed_named_dags(temp_bo_fn, tmp_path: Path, dag_nodes: dict[str, str]): + project = DmlProjectConfig(name="demo", owner="alice", remote_root="s3://bucket/prefix") + init_project_layout(tmp_path, project) + head_ops = HeadOps(_db=temp_bo_fn._db) + commit_ops = CommitOps(_db=temp_bo_fn._db) + dag_ops = DagOps(_db=temp_bo_fn._db) + index_ops = IndexOps(_db=temp_bo_fn._db, remote_root="") + + main_head = head_ops.create_branch("main") + head_ops.write_attached_head("main") + + node_refs: 
dict[str, Ref] = {} + for dag_name, node_name in dag_nodes.items(): + index_id = index_ops.create(head=main_head) + node_ref = index_ops.put_literal(index_id, dag_name, name=node_name) + index_ops.commit(index_id, node_ref, head=main_head, message=f"add {dag_name}", dag_name=dag_name) + node_refs[dag_name] = node_ref + + latest_commit = head_ops.get_branch_commit(main_head) + dag_refs = {dag_name: commit_ops.get_dag(latest_commit, dag_name) for dag_name in dag_nodes} + return commit_ops, head_ops, dag_ops, latest_commit, dag_refs, node_refs + + +@pytest.mark.parametrize( + "contract_id,label,revision_builder,expected_kind,expected_commit", + [ + ( + "revision-form-classification", + "branch", + lambda initial, next_commit: "main", + "branch", + lambda initial, next_commit: next_commit, + ), + ( + "revision-form-classification", + "tag", + lambda initial, next_commit: "v1_0", + "tag", + lambda initial, next_commit: initial, + ), + ( + "revision-form-classification", + "ancestry-expression", + lambda initial, next_commit: "HEAD~1", + "commit", + lambda initial, next_commit: initial, + ), + ( + "revision-form-classification", + "direct-commit-id", + lambda initial, next_commit: initial.id(), + "commit", + lambda initial, next_commit: initial, + ), + ( + "revision-form-classification", + "explicit-commit-ref", + lambda initial, next_commit: f"commit:{initial.id()}", + "commit", + lambda initial, next_commit: initial, + ), + ], + ids=[ + "revision-form-classification:branch", + "revision-form-classification:tag", + "revision-form-classification:ancestry-expression", + "revision-form-classification:direct-commit-id", + "revision-form-classification:explicit-commit-ref", + ], +) +def test_revision_form_classification_matrix( + temp_bo_fn, + tmp_path: Path, + contract_id, + label, + revision_builder, + expected_kind, + expected_commit, +): + del contract_id, label + commit_ops, initial, next_commit = _seed_project_commit_history(temp_bo_fn, tmp_path) + revision = 
revision_builder(initial, next_commit) + resolved = resolve_revision( + value=revision, + commit_ops=commit_ops, + head_ops=HeadOps(_db=temp_bo_fn._db), + project_dir=str(tmp_path), + ) + assert resolved.kind == expected_kind + assert resolved.commit == expected_commit(initial, next_commit) + + +def test_revision_rejects_unfetched_remote_root_boundary(temp_bo_fn, tmp_path: Path): + commit_ops, _initial, _next_commit = _seed_project_commit_history(temp_bo_fn, tmp_path) + with pytest.raises(DmlRepoError, match="cannot be resolved locally"): + resolve_revision( + value="dml://alice/demo#main", + commit_ops=commit_ops, + head_ops=HeadOps(_db=temp_bo_fn._db), + project_dir=str(tmp_path), + ) + + +def test_detached_head_ancestry_resolves_from_head_file(temp_bo_fn, tmp_path: Path): + commit_ops, initial, next_commit = _seed_project_commit_history(temp_bo_fn, tmp_path) + HeadOps(_db=temp_bo_fn._db).write_detached_head(next_commit) + + resolved = resolve_revision( + value="HEAD~1", + commit_ops=commit_ops, + head_ops=HeadOps(_db=temp_bo_fn._db), + project_dir=str(tmp_path), + ) + + assert resolved.kind == "commit" + assert resolved.commit == initial + + +def test_dag_resolution_returns_named_dag_ref(temp_bo_fn, tmp_path: Path): + commit_ops, head_ops, _dag_ops, latest_commit, dag_refs, _node_refs = _seed_named_dags( + temp_bo_fn, tmp_path, {"train": "result"} + ) + + resolved = resolve_dag_ref( + value="train", + revision="HEAD", + commit_ops=commit_ops, + head_ops=head_ops, + project_dir=str(tmp_path), + operation="get", + ) + + assert resolved.ref == dag_refs["train"] + assert resolved.value == "train" + assert resolved.revision is not None + assert resolved.revision.commit == latest_commit + + +def test_dag_resolution_requires_name_selector_input(temp_bo_fn, tmp_path: Path): + commit_ops, head_ops, _dag_ops, _latest_commit, dag_refs, _node_refs = _seed_named_dags( + temp_bo_fn, tmp_path, {"train": "result"} + ) + + with pytest.raises(DmlRepoError, match="DAG name is 
required"): + resolve_dag_ref( + value=dag_refs["train"], + revision="HEAD", + commit_ops=commit_ops, + head_ops=head_ops, + project_dir=str(tmp_path), + operation="get", + ) + + +def test_node_resolution_resolves_named_node_with_explicit_dag(temp_bo_fn, tmp_path: Path): + commit_ops, head_ops, dag_ops, _latest_commit, _dag_refs, node_refs = _seed_named_dags( + temp_bo_fn, tmp_path, {"train": "result"} + ) + + resolved = resolve_node_ref( + value="result", + dag="train", + revision="HEAD", + commit_ops=commit_ops, + dag_ops=dag_ops, + head_ops=head_ops, + project_dir=str(tmp_path), + operation="describe-node", + ) + + assert resolved.ref == node_refs["train"] + assert resolved.dag == "train" + assert resolved.revision is not None + + +def test_node_resolution_resolves_named_node_without_dag_when_unique(temp_bo_fn, tmp_path: Path): + commit_ops, head_ops, dag_ops, _latest_commit, _dag_refs, node_refs = _seed_named_dags( + temp_bo_fn, tmp_path, {"train": "result", "score": "score_result"} + ) + + resolved = resolve_node_ref( + value="score_result", + commit_ops=commit_ops, + dag_ops=dag_ops, + head_ops=head_ops, + project_dir=str(tmp_path), + operation="get-node", + ) + + assert resolved.ref == node_refs["score"] + assert resolved.dag == "score" + assert resolved.revision is not None + + +def test_node_resolution_rejects_ambiguous_named_lookup_without_dag(temp_bo_fn, tmp_path: Path): + commit_ops, head_ops, dag_ops, _latest_commit, _dag_refs, _node_refs = _seed_named_dags( + temp_bo_fn, tmp_path, {"train": "result", "score": "result"} + ) + + with pytest.raises(DmlRepoError, match="requires dag for ambiguous node lookup"): + resolve_node_ref( + value="result", + commit_ops=commit_ops, + dag_ops=dag_ops, + head_ops=head_ops, + project_dir=str(tmp_path), + operation="describe-node", + ) + + +def test_dag_resolution_rejects_ref_like_dag_string_selector(temp_bo_fn, tmp_path: Path): + commit_ops, head_ops, _dag_ops, _latest_commit, _dag_refs, _node_refs = 
_seed_named_dags( + temp_bo_fn, tmp_path, {"train": "result"} + ) + + with pytest.raises(DmlRepoError, match="Expected dag Ref"): + resolve_dag_ref( + value="dag:abc123", + revision=None, + commit_ops=commit_ops, + head_ops=head_ops, + project_dir=str(tmp_path), + operation="get", + ) + + +def test_node_resolution_rejects_ref_like_node_string_selector(temp_bo_fn, tmp_path: Path): + commit_ops, head_ops, dag_ops, _latest_commit, _dag_refs, _node_refs = _seed_named_dags( + temp_bo_fn, tmp_path, {"train": "result"} + ) + + with pytest.raises(DmlRepoError, match="Expected node Ref"): + resolve_node_ref( + value="node-literal:abc123", + dag="train", + revision="HEAD", + commit_ops=commit_ops, + dag_ops=dag_ops, + head_ops=head_ops, + project_dir=str(tmp_path), + operation="get-node", + ) diff --git a/tests/contracts/internal/support/conftest_support.py b/tests/contracts/internal/support/conftest_support.py new file mode 100644 index 0000000..5a90fa7 --- /dev/null +++ b/tests/contracts/internal/support/conftest_support.py @@ -0,0 +1,295 @@ +"""Common test fixtures for dml-util tests.""" + +import base64 +import hashlib +import os +import shutil +import tempfile +from contextlib import contextmanager +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict +from unittest.mock import patch + +import pytest + +from daggerml._internal._db import DmlDbEnv, DmlDbMapFullError, Ref +from daggerml._internal.ops.base_ops import BaseOps +from daggerml._internal.types import NAMESPACES + + +def split_remote_root(remote_root: str) -> tuple[str, str]: + """Parse `s3://bucket/prefix` into (bucket, project_prefix).""" + if not remote_root.startswith("s3://"): + raise ValueError(f"Invalid remote root URI: {remote_root!r}") + rest = remote_root[5:] + if not rest: + raise ValueError(f"Invalid remote root URI: {remote_root!r}") + if "/" not in rest: + return rest, "" + bucket, prefix = rest.split("/", 1) + return bucket, prefix.strip("/") + + +def 
remote_bucket_and_prefix_from_env() -> tuple[str, str]: + return split_remote_root(os.environ["DML_REMOTE_ROOT"]) + + +def remote_protocol_prefix_from_env() -> str: + """Return protocol prefix rooted under `/dml`.""" + _bucket, project_prefix = remote_bucket_and_prefix_from_env() + return f"{project_prefix}/dml" if project_prefix else "dml" + + +@pytest.fixture(scope="module") +def _aws_server(): + """Module fixture providing a moto S3 server.""" + with patch.dict(os.environ): + # IMPORTANT: clear out env variables for safety **BEFORE** importing moto + for k in os.environ: + if k.startswith("AWS_"): + del os.environ[k] + from moto.server import ThreadedMotoServer + + server = ThreadedMotoServer(port=0, verbose=False) + server.start() + host, port = server.get_host_and_port() + yield { + "server": server, + "endpoint": f"http://{host}:{port}", + "port": port, + "envvars": { + "AWS_ACCESS_KEY_ID": "test", + "AWS_SECRET_ACCESS_KEY": "test", + "AWS_REGION": "us-east-1", + "AWS_DEFAULT_REGION": "us-east-1", + "AWS_ENDPOINT_URL": f"http://{host}:{port}", + }, + } + server.stop() + + +@pytest.fixture(autouse=True) +def clear_envvars(): + """Autouse fixture to clear AWS/DML env vars and set test values.""" + with patch.dict(os.environ): + # Clear existing AWS and DML environment variables + for k in list(os.environ.keys()): + if k.startswith("AWS_") or k.startswith("DML_"): + del os.environ[k] + + # Set test-specific environment variables + os.environ["DML_REMOTE_ROOT"] = "s3://test-bucket/test-prefix" + os.environ["AWS_SHARED_CREDENTIALS_FILE"] = "/dev/null" + os.environ["PYTHONPATH"] = "." 
# ensure `tests` is in PYTHONPATH + yield + + +@pytest.fixture +def aws_server(_aws_server, clear_envvars): + """Fixture that sets up AWS environment and returns server info.""" + import boto3 + + # Set environment variables from _aws_server + os.environ.update(_aws_server["envvars"]) + # Call boto3.setup_default_session() after env vars are set + boto3.setup_default_session() + yield _aws_server + + +@pytest.fixture +def s3(aws_server): + """Fixture providing a boto3 S3 client and ensuring bucket exists.""" + import boto3 + + s3_client = boto3.client("s3", endpoint_url=aws_server["endpoint"]) + bucket, _prefix = remote_bucket_and_prefix_from_env() + try: + s3_client.create_bucket(Bucket=bucket) + except s3_client.exceptions.BucketAlreadyExists: + pass # Bucket already exists, which is fine + except s3_client.exceptions.BucketAlreadyOwnedByYou: + pass # Bucket already owned by us, which is fine + yield s3_client + + +@pytest.fixture +def db(): + """Fixture providing a FakeDb instance for testing.""" + return FakeDb() + + +@pytest.fixture +def remote_ops(db, s3): + """Fixture providing RemoteOps instance using plain boto3.client('s3').""" + from daggerml._internal.ops.remote import RemoteOps + + bucket, prefix = remote_bucket_and_prefix_from_env() + yield RemoteOps( + _db=db, + client=s3, + bucket=bucket, + prefix=prefix, + ) + + +@pytest.fixture(scope="class") +def integration_remote_ops(temp_bo, aws_server): + """Fixture providing RemoteOps instance with real database for integration tests.""" + import boto3 + + from daggerml._internal.ops.remote import RemoteOps + + # Create S3 client for integration tests + s3_client = boto3.client("s3", endpoint_url=aws_server["endpoint"]) + + # Ensure bucket exists + bucket, prefix = remote_bucket_and_prefix_from_env() + try: + s3_client.create_bucket(Bucket=bucket) + except s3_client.exceptions.BucketAlreadyExists: + pass # Bucket already exists, which is fine + except s3_client.exceptions.BucketAlreadyOwnedByYou: + pass # 
Bucket already owned by us, which is fine + + yield RemoteOps( + _db=temp_bo, + client=s3_client, + bucket=bucket, + prefix=prefix, + ) + + +@pytest.fixture +def temp_db_fn(): + """Function-scoped fixture providing a temporary DmlDbEnv for integration tests.""" + with tempfile.TemporaryDirectory() as temp_dir: + db_env = None + try: + db_path = Path(temp_dir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db_env = TmpEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + yield db_env + finally: + if db_env is not None: + db_env.clear_all() + db_env.close() + + +@pytest.fixture +def temp_bo_fn(temp_db_fn): + """Function-scoped fixture providing a BaseOps instance with a temporary database.""" + yield BaseOps(temp_db_fn) + + +@pytest.fixture +def integration_remote_ops_fn(temp_bo_fn, aws_server): + """Function-scoped fixture providing RemoteOps instance with real database for integration tests.""" + import boto3 + + from daggerml._internal.ops.remote import RemoteOps + + # Create S3 client for integration tests + s3_client = boto3.client("s3", endpoint_url=aws_server["endpoint"]) + + # Ensure bucket exists + bucket, prefix = remote_bucket_and_prefix_from_env() + try: + s3_client.create_bucket(Bucket=bucket) + except s3_client.exceptions.BucketAlreadyExists: + pass # Bucket already exists, which is fine + except s3_client.exceptions.BucketAlreadyOwnedByYou: + pass # Bucket already owned by us, which is fine + + yield RemoteOps( + _db=temp_bo_fn._db, + client=s3_client, + bucket=bucket, + prefix=prefix, + ) + + +@dataclass +class FakeTxn: + """Fake transaction implementation matching required interface.""" + + kv: Dict[str, Any] + readonly: bool + + def get(self, ref, raw=False): + """Get value by ref from fake storage.""" + return self.kv.get(ref.to) + + def exists(self, ref): + """Check whether a ref exists in fake storage.""" + return ref.to in self.kv + + def put(self, value, *, to=None, **kwargs): + """Put value at ref in fake storage.""" + if 
self.readonly: + raise ValueError("Cannot put in readonly transaction") + if to is None: + ns = kwargs.get("ns") + if ns is None: + raise ValueError("FakeTxn.put requires either to or ns") + if kwargs.get("raw"): + decoded = base64.b64decode(value) + to = Ref(f"{ns}:{hashlib.sha256(decoded).hexdigest()}") + else: + raise ValueError("FakeTxn.put without 'to' only supports raw=True") + self.kv[to.to] = value + return to + + +@dataclass +class FakeDb: + """Fake database implementation matching required interface.""" + + kv: Dict[str, Any] = field(default_factory=dict) + namespaces: list = field(default_factory=lambda: sorted(NAMESPACES)) + path: str = "/tmp/daggerml-fake/.dml/db" + + @contextmanager + def tx(self, readonly=False): + """Transaction context manager returning a raw fake transaction.""" + yield FakeTxn(self.kv, readonly) + + +@dataclass +class TmpEnv(DmlDbEnv): + def clear_all(self): + while True: + try: + with self.tx(readonly=False) as txn: + for ns in NAMESPACES: + for obj, _ in txn.iter(ns): + txn.delete(obj) + db_path = Path(self.path) + repo_root = db_path.parent.parent if db_path.name == "db" and db_path.parent.name == ".dml" else db_path + shutil.rmtree(repo_root / ".dml", ignore_errors=True) + db_path.mkdir(parents=True, exist_ok=True) + return + except DmlDbMapFullError: + self.resize(self.get_size() * 2) + + +@pytest.fixture(scope="class") +def temp_db(): + """Provides a temporary DmlDbEnv for testing.""" + with tempfile.TemporaryDirectory() as temp_dir: + db_env = None + try: + db_path = Path(temp_dir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db_env = TmpEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + yield db_env + finally: + if db_env is not None: + db_env.clear_all() + db_env.close() + + +@pytest.fixture(scope="class") +def temp_bo(temp_db): + """Provides a BaseOps instance with a temporary database.""" + yield BaseOps(temp_db) diff --git a/tests/contracts/internal/support/test_db_support.py 
b/tests/contracts/internal/support/test_db_support.py new file mode 100644 index 0000000..5b7de61 --- /dev/null +++ b/tests/contracts/internal/support/test_db_support.py @@ -0,0 +1,524 @@ +import hashlib +import os +import random +import threading +from base64 import b64decode, b64encode +from contextlib import contextmanager +from uuid import uuid4 + +import pytest +from hypothesis import HealthCheck, assume, given, settings +from hypothesis import strategies as st + +from daggerml._internal._db import ( + DmlDbEnv, + DmlDbError, + DmlDbForkedTxnError, + DmlDbInvalidPathError, + DmlDbKeyNotFoundError, + Ref, +) + +REF_ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789" +STR_ALPHABET = st.characters(blacklist_categories=("Cs", "Cc"), blacklist_characters="\x00") + + +def _refs(*ns: str, full: bool = False): + if not ns: + raise ValueError("at least one namespace must be provided") + # Use printable UTF-8 characters (STR_ALPHABET) when full is True; otherwise + # use a restricted alphanumeric REF_ALPHABET. Additionally restrict the + # UTF-8 encoded byte length to DML_REF_ID_MAX (64 bytes) since the C parser + # enforces a maximum id byte size. 
+ text_strat = st.text( + alphabet=STR_ALPHABET if full else REF_ALPHABET, + min_size=1 if full else 32, + max_size=64, + ).filter(lambda s: len(s.encode("utf-8")) <= 64) + ns_ = "-".join(ns) + return st.builds(lambda ident: Ref(f"{ns_}:{ident}"), text_strat) + + +def _gen_ref(*ns: str) -> Ref: + """Generate a random ref for testing, avoiding Hypothesis .example() issues.""" + if not ns: + raise ValueError("at least one namespace must be provided") + ns_str = "-".join(ns) + # Generate a random identifier using REF_ALPHABET + ident = "".join(random.choice(REF_ALPHABET) for _ in range(32)) + return Ref(f"{ns_str}:{ident}") + + +def int_strategy(): + return st.integers(min_value=-(2**63), max_value=2**63 - 1) + + +def float_strategy(): + return st.floats(allow_nan=False, allow_infinity=False) + + +def scalar_strategy(recursive=False): + return st.one_of( + int_strategy(), + float_strategy(), + st.booleans(), + st.text(), + st.none(), + ) + + +def dml_object(): + return st.recursive( + scalar_strategy(recursive=True), + lambda children: st.one_of( + st.lists(children, max_size=4), + st.dictionaries(st.text(min_size=1, max_size=5), children, max_size=4), + ), + max_leaves=12, + ) + + +@contextmanager +def make_db(root, name, namespaces=("a", "b")): + path = root / name + path.mkdir() + env = DmlDbEnv.create(str(path), namespaces=namespaces) + try: + yield env + finally: + env.close() + + +@pytest.fixture +def db_env(tmp_path): + tmpdir = tmp_path / f"db_env_{uuid4().hex}" + tmpdir.mkdir() + env = DmlDbEnv.create(str(tmpdir), namespaces=["a", "b"]) + size = env.get_size() + assert size > 0 + new_size = size + 512 * 1024 + env.resize(new_size) + try: + yield env + finally: + env.close() + + +class TestDbEnv: + def test_create_invalid_path(self, tmp_path): + missing_path = tmp_path / "missing" / "repo" + with pytest.raises(DmlDbInvalidPathError): + DmlDbEnv.create(str(missing_path), namespaces=["a"]) + + def test_create_and_open(self, tmp_path): + db_path = tmp_path / 
"db_env" + db_path.mkdir() + env = DmlDbEnv.create(str(db_path), namespaces=["a", "b"]) + assert env is not None + assert env.path == str(db_path) + size = env.get_size() + assert size > 0 + new_size = size + 512 * 1024 + env.resize(new_size) + assert env.get_size() == new_size + with env.tx(readonly=False) as txn: + x = txn.put("hello", ns="a") + assert isinstance(x, Ref) + assert x.ns() == "a" + with env.tx(readonly=True) as txn: + assert txn.get(x) == "hello" + env.close() + + def test_open_requires_mapsize_for_large_db(self, tmp_path): + db_path = tmp_path / "db_env_large" + db_path.mkdir() + DmlDbEnv.create(str(db_path), namespaces=["a"]) + + @contextmanager + def _open(map_size=None): + db = DmlDbEnv.open(str(db_path), namespaces=["a"], map_size=map_size) + try: + yield db + finally: + db.close() + + # Use small strings within the 1MB limit + small_data = "x" * (100 * 1024) # 100KB + # Try to fill the default map without explicit map_size + # Expect failure when DB fills up + refs = [] + with pytest.raises((DmlDbError, RuntimeError)): + with _open() as env: + with env.tx(readonly=False) as txn: + for i in range(1000): # Cap iterations + ref = txn.put(f"{small_data}_{i}", ns="a") + refs.append(ref) + # Reopen with larger map_size and verify success + with _open(20 * 1024**2) as env: # 20MB map + with env.tx(readonly=False) as txn: + for i in range(100): + ref = txn.put(f"{small_data}_{i}", ns="a") + refs.append(ref) + # Verify data persists + with _open() as env: + with env.tx(readonly=True) as txn: + assert txn.get(refs[-1]).startswith(small_data[:50]) + + @given(dml_object(), _refs("a", full=True)) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=10) + def test_given_key_round_trip(self, db_env, data, to_ref): + with db_env.tx(readonly=False) as txn: + x = txn.put(data, to=to_ref) + assert isinstance(x, Ref) + assert x.ns() == "a" + with db_env.tx(readonly=True) as txn: + assert txn.get(x) == data + with 
db_env.tx(readonly=False) as txn: + txn.delete(x) + + @given(dml_object()) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=10) + def test_round_trip(self, db_env, data): + with db_env.tx(readonly=False) as txn: + x = txn.put(data, ns="a") + assert isinstance(x, Ref) + assert x.ns() == "a" + with db_env.tx(readonly=True) as txn: + assert txn.get(x) == data + + def test_overwrite(self, db_env): + with db_env.tx(readonly=False) as txn: + x = txn.put("hello", ns="a") + assert txn.get(x) == "hello" + y = txn.put("world", to=x) + assert x == y + with db_env.tx(readonly=True) as txn: + assert txn.get(x) == "world" + + def test_no_overwrite(self, db_env): + with db_env.tx(readonly=False) as txn: + x = txn.put("hello", ns="a") + assert txn.get(x) == "hello" + y = txn.put("world", to=x, no_overwrite=True) + assert x == y + with db_env.tx(readonly=True) as txn: + assert txn.get(x) == "hello" + + def test_delete(self, db_env): + with db_env.tx(readonly=False) as txn: + x = txn.put("hello", ns="a") + txn.delete(x) + with db_env.tx(readonly=True) as txn: + with pytest.raises(DmlDbKeyNotFoundError): + txn.get(x) + + def test_get_missing_key(self, db_env): + key = Ref("a:missing") + with db_env.tx(readonly=True) as txn: + with pytest.raises(DmlDbKeyNotFoundError): + txn.get(key) + + @given(st.binary()) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=10) + def test_put_raw_parameter(self, db_env, data): + """Test the raw parameter in the put method.""" + raw_bytes = b64encode(data).decode("utf-8") + with db_env.tx(readonly=False) as txn: + ref = txn.put(raw_bytes, ns="a", raw=True) + with db_env.tx(readonly=True) as txn: + raw_result = txn.get(ref, raw=True) + assert isinstance(raw_result, str) + assert raw_result == raw_bytes + + @given(dml_object()) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=10) + def test_raw_data_roundtrip_between_databases(self, tmp_path, 
data): + with make_db(tmp_path, f"db1_{uuid4().hex}", namespaces=["a"]) as db1: + with db1.tx(readonly=False) as txn: + ref1 = txn.put(data, ns="a") + # Retrieve the raw data + with db1.tx(readonly=True) as txn: + raw_data = txn.get(ref1, raw=True) + # Create second database and insert the raw data + with make_db(tmp_path, f"db2_{uuid4().hex}", namespaces=["a"]) as db2: + with db2.tx(readonly=False) as txn: + ref2 = txn.put(raw_data, ns="a", raw=True) + # Retrieve the raw data from the second database + with db2.tx(readonly=True) as txn: + new_data = txn.get(ref2) + # Verify the raw data is identical + assert new_data == data + + @given(dml_object()) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=10) + def test_ref_id_is_sha256sum_of_raw_data(self, db_env, data): + """ + Insert arbitrary data (including dicts, lists, refs, etc.) and get back `ref`. + Then ensure the `ref.id()` matches the SHA-256 hash of the raw data retrieved with `get(raw=True)`. + + Note: We should **not** need msgpack here, as we're only verifying the hash of the raw data string. 
+ """ + with db_env.tx(readonly=False) as txn: + ref = txn.put(data, ns="a") + raw = txn.get(ref, raw=True) + sha256_hash = hashlib.sha256(b64decode(raw)).hexdigest() + assert ref.id() == sha256_hash + + def test_exists(self, db_env): + key = Ref("a:mykey") + with db_env.tx(readonly=True) as txn: + assert not txn.exists(key) + with db_env.tx(readonly=False) as txn: + x = txn.put("hello", to=key) + with db_env.tx(readonly=True) as txn: + assert txn.exists(x) + + def test_contextmanager(self, db_env): + with db_env.tx(readonly=False) as txn: + x = txn.put("x", ns="a") + assert txn.get(x) == "x" + with db_env.tx(readonly=True) as txn: + assert txn.get(x) == "x" + + def test_nested_transactions(self, db_env): + with db_env.tx(readonly=False) as txn0: + x0 = txn0.put(0, ns="a") + assert txn0.get(x0) == 0 + x1 = txn0.put(1, ns="a") + assert txn0.get(x1) == 1 + with db_env.tx(readonly=True) as txn: + with pytest.raises(DmlDbKeyNotFoundError): + txn.get(x1) # creates a new read transaction + # Note: nested transactions via txn0.tx() are not supported in the new API + # So we'll continue using the same transaction + x2 = txn0.put(2, ns="a") + assert txn0.get(x2) == 2 + with db_env.tx(readonly=True) as txn: + with pytest.raises(DmlDbKeyNotFoundError): + txn.get(x2) + assert txn0.get(x2) == 2 + with db_env.tx(readonly=True) as txn: + with pytest.raises(DmlDbKeyNotFoundError): + txn.get(x2) + with db_env.tx(readonly=True) as txn: + assert txn.get(x2) == 2 + assert txn.get(x1) == 1 + assert txn.get(x0) == 0 + + @pytest.mark.skipif(not hasattr(os, "fork"), reason="fork not available on this platform") + def test_fork_closes_inherited_txn(self, db_env): + with pytest.raises(RuntimeError, match="test raise"): + with db_env.tx(readonly=False) as txn: + pid = os.fork() + if pid == 0: + try: + txn.put("child-write", ns="a") + except DmlDbForkedTxnError: + os._exit(0) + os._exit(1) + pid, status = os.waitpid(pid, 0) + assert os.WIFEXITED(status) + assert os.WEXITSTATUS(status) == 0 + 
raise RuntimeError("test raise") + + def test_unreachable_objects(self, db_env): + with db_env.tx(readonly=False) as txn: + ref1 = txn.put("v1", ns="a") + ref2 = txn.put("v2", ns="a") + ref3 = txn.put({"child": ref1}, ns="a") + # list_orphans is a txn-level method + unreachable = txn.list_orphans(start=[ref3]) + assert ref2 in unreachable + assert ref1 not in unreachable + assert ref3 not in unreachable + + @given( + st.recursive( + scalar_strategy(recursive=False), + lambda children: st.lists(children, max_size=4) + | st.dictionaries(st.text(min_size=1, max_size=5), children, max_size=4), + max_leaves=20, + ) + ) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=50) + def test_deep_nested_round_trip(self, db_env, data): + """Store and retrieve deeply nested data structures to ensure C -> Python + deserialization properly transfers ownership and produces equivalent + Python objects (no use-after-free or partial copies). + """ + + def max_depth(o): + if isinstance(o, dict): + return 1 + max((max_depth(v) for v in o.values()), default=0) + if isinstance(o, (list, tuple)): + return 1 + max((max_depth(v) for v in o), default=0) + return 0 + + assume(max_depth(data) > 1) + with db_env.tx(readonly=False) as txn: + ref = txn.put(data, ns="a") + with db_env.tx(readonly=True) as txn: + got = txn.get(ref) + with db_env.tx(readonly=False) as txn: + txn.delete(ref) + assert got == data + if isinstance(data, dict): + assert set(got.keys()) == set(data.keys()) + + def test_close_idempotency(self, db_env): + # Note: nested transactions are not supported in the new API + # So we'll just test that a single transaction works + with db_env.tx(readonly=True) as _txn: + pass + + +class TestIter: + @pytest.mark.parametrize( + "start_token, expected_keys", + [ + (None, ["k1", "k2", "k3"]), + ("k2", ["k2", "k3"]), + ], + ) + def test_iter_items(self, db_env, start_token, expected_keys): + expected = {"k1": "v1", "k2": "v2", "k3": "v3"} + with 
db_env.tx(readonly=False) as txn: + for key, value in expected.items(): + txn.put(value, to=Ref(f"a:{key}")) + + # iter is a txn-level method, so we need a read transaction + with db_env.tx(readonly=True) as txn: + items = list(txn.iter("a", start_token=start_token)) + keys = [ref.id() for ref, _ in items] + values = {ref.id(): value for ref, value in items} + + assert keys == expected_keys + assert values == {key: expected[key] for key in expected_keys} + + def test_iter_transaction_visibility(self, db_env): + with db_env.tx(readonly=False) as txn_setup: + txn_setup.put("v0", to=Ref("a:k0")) + txn_setup.put("vb0", to=Ref("b:kb0")) + with db_env.tx(readonly=False) as txn: + txn.put("v1", to=Ref("a:k1")) + txn.put("vb1", to=Ref("b:kb1")) + txn_items = list(txn.iter("a")) + txn_keys = [ref.id() for ref, _ in txn_items] + txn_values = {ref.id(): value for ref, value in txn_items} + assert txn_keys == ["k0", "k1"] + assert txn_values == {"k0": "v0", "k1": "v1"} + # For root_items, we need a new read transaction + with db_env.tx(readonly=True) as root_txn: + root_items = list(root_txn.iter("a")) + root_keys = [ref.id() for ref, _ in root_items] + assert root_keys == ["k0"] + with db_env.tx(readonly=True) as after_txn: + after_items = list(after_txn.iter("a")) + after_keys = [ref.id() for ref, _ in after_items] + after_values = {ref.id(): value for ref, value in after_items} + + assert after_keys == ["k0", "k1"] + assert after_values == {"k0": "v0", "k1": "v1"} + + def test_empty_db(self, db_env): + # iter is a txn-level method, so we need a read transaction + with db_env.tx(readonly=True) as txn: + with pytest.raises(DmlDbKeyNotFoundError): + list(txn.iter("a")) + + +class TestIterationTruncation: + def test_null_chars(self, temp_db): + """ + Verify that a key containing a NUL byte is yielded intact by the iterator + (not cut short at the NUL) and that deleting the original Ref succeeds, + after which reading it raises DmlDbKeyNotFoundError. 
+ """ + with temp_db.tx(readonly=False) as txn: + # Use a real DB namespace ("head" is no longer a DB namespace). + original = Ref("commit:ab\x00cd") + txn.put({"x": 1}, to=original) + listed = [r for r, _ in txn.iter("commit")] + assert any(isinstance(r, Ref) for r in listed), "no refs yielded" + listed_strs = [r.to for r in listed] + # Fixed behavior: full key is preserved, deletion should succeed + assert "commit:ab\x00cd" in listed_strs + txn.delete(original) + with pytest.raises(DmlDbKeyNotFoundError): + txn.get(original) + + @given(_refs("commit", full=True)) + def test_any_chars(self, temp_db, ref): + """ + Verify that a key whose id is drawn from _refs("commit", full=True) is + yielded intact by the iterator and can be deleted via the original Ref, + after which reading it raises DmlDbKeyNotFoundError. + """ + with temp_db.tx(readonly=False) as txn: + txn.put({"x": 1}, to=ref) + listed = [r for r, _ in txn.iter("commit")] + assert any(isinstance(r, Ref) for r in listed), "no refs yielded" + # Historically the iterator used strlen and produced a truncated id 
+ # Fixed behavior: full key is preserved, deletion should succeed + assert ref in listed + txn.delete(ref) + with pytest.raises(DmlDbKeyNotFoundError): + txn.get(ref) + + +class TestRef: + def test_nss_simple(self): + ref = Ref("head:mainbranch") + assert ref.nss() == ["head"] + + def test_nss_hierarchical(self): + ref = Ref("node-argv:asdfqiowewf") + assert ref.nss() == ["node", "argv"] + + def test_nss_deep_hierarchy(self): + ref = Ref("a-b-c:id") + assert ref.nss() == ["a", "b", "c"] + + def test_nss_no_colon(self): + ref = Ref("datum:id") + assert ref.nss() == ["datum"] + + def test_nss_multiple_colons(self): + ref = Ref("node-fn-arg:id") + assert ref.nss() == ["node", "fn", "arg"] + + def test_threaded_puts(self, db_env): + refs = [] + + def worker(i): + with db_env.tx(readonly=False) as txn: + ref = txn.put({"v": i}, ns="a") + refs.append(ref) + + threads = [threading.Thread(target=worker, args=(i,)) for i in range(5)] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + assert len(refs) == 5 + with db_env.tx(readonly=True) as txn: + values = sorted(txn.get(ref)["v"] for ref in refs) + assert values == list(range(5)) + + @pytest.mark.skipif(not hasattr(os, "fork"), reason="fork not available on this platform") + def test_fork_reopen(self, db_env): + with db_env.tx(readonly=False) as txn: + ref = txn.put({"v": "child"}, ns="a") + pid = os.fork() + if pid == 0: + try: + with db_env.tx(readonly=True) as txn: + assert txn.get(ref) == {"v": "child"} + except Exception: + os._exit(1) + os._exit(0) + pid, status = os.waitpid(pid, 0) + assert os.WIFEXITED(status) + assert os.WEXITSTATUS(status) == 0 diff --git a/tests/contracts/internal/support/util_support.py b/tests/contracts/internal/support/util_support.py new file mode 100644 index 0000000..7f9f39a --- /dev/null +++ b/tests/contracts/internal/support/util_support.py @@ -0,0 +1,4 @@ +from pathlib import Path + +# Directory containing test assets (fn/, etc.) 
+TEST_DIR = Path(__file__).resolve().parents[3] / "assets" / "internal_fn" diff --git a/tests/contracts/internal/test_builtins_contract.py b/tests/contracts/internal/test_builtins_contract.py new file mode 100644 index 0000000..14062e7 --- /dev/null +++ b/tests/contracts/internal/test_builtins_contract.py @@ -0,0 +1,82 @@ +"""Tests for builtins.py - built-in functions.""" + +import pytest + +from daggerml._internal.builtins import BUILTIN_FNS + + +class TestBuiltinFunctions: + """Test built-in function implementations.""" + + def test_builtin_fns_defined(self): + """Test that BUILTIN_FNS contains expected functions.""" + expected_functions = ["get", "contains", "list", "dict", "assoc", "conj", "unnest"] + for func_name in expected_functions: + assert func_name in BUILTIN_FNS + + @pytest.mark.parametrize( + "fn_name,args,expected", + [ + # get with string keys for dict + ("get", ({"x": 42}, "x"), 42), + ("get", ({"x": 42}, "y", 100), 100), + # get with a [start, stop] slice key for list + ("get", ([10, 20, 30, 40, 50], [1, 4]), [20, 30, 40]), + # get falls back to the default for a missing dict key + ("get", ({"a": 1, "b": 2}, "c", 0), 0), + # contains + ("contains", ({"a": 1, "b": 2}, "a"), True), + ("contains", ({"a": 1, "b": 2}, "c"), False), + ("contains", ([1, 2, 3], 2), True), + ("contains", ([1, 2, 3], 4), False), + # constructors + ("list", (1, 2, 3), [1, 2, 3]), + ("dict", ("a", 1, "b", 2), {"a": 1, "b": 2}), + # assoc + ("assoc", ({"a": 1}, "b", 2), {"a": 1, "b": 2}), + ("assoc", ({"a": 1, "b": 2}, "b", 3), {"a": 1, "b": 3}), + # conj + ("conj", ([1, 2], 3), [1, 2, 3]), + # unnest + ("unnest", [[[1, 2], [3, 4]]], [1, 2, 3, 4]), + ("unnest", [[[1, 2], [3, [4]]]], [1, 2, 3, [4]]), # note it's not flattened + ], + ) + def test_builtin_dispatch_returns_expected_values(self, fn_name, args, expected): + """Test that BUILTIN_FNS[fn_name](*args) returns the expected value.""" + fn = BUILTIN_FNS[fn_name] + result = fn(*args) + assert result == expected + + @pytest.mark.parametrize( + "fn_name,args,expected_error,error_message", + [ + # get 
function errors - lists + ("get", ([1, 2, 3], 0, "default"), TypeError, "Default values not supported for list access"), + ("get", ([1, 2, 3], [1]), ValueError, "Slice key must have exactly 2 elements"), + ("get", ([1, 2, 3], [1, "2"]), TypeError, "Slice indices must be integers"), + ("get", ([1, 2, 3], "not_int"), TypeError, "List indices must be integers"), + # get function errors - dicts + ("get", ({"a": 1}, [1, 2]), TypeError, "Dict keys must be strings"), + ("get", ({"a": 1}, 42), TypeError, "Dict keys must be strings"), + # get function errors - other types + ("get", (42, "key"), TypeError, "Cannot get from object of type int, expected list or dict"), + ("get", ("string", 0), TypeError, "Cannot get from object of type str, expected list or dict"), + # contains function errors + ("contains", (42, 1), TypeError, "Cannot check contains on object of type int, expected list or dict"), + # dict function errors + ("dict", ("a", 1, "b"), ValueError, "Dict requires an even number of arguments"), + ("dict", ([1, 2], 3, 4, 5), TypeError, "Invalid key-value pairs for dict"), + # assoc function errors + ("assoc", (42, "key", "value"), TypeError, "Cannot assoc on object of type int, expected dict"), + ("assoc", ([1, 2, 3], "key", "value"), TypeError, "Cannot assoc on object of type list, expected dict"), + # conj function errors + ("conj", (42, 1), TypeError, "Cannot conj on object of type int, expected list"), + ("conj", ({"a": 1}, 1), TypeError, "Cannot conj on object of type dict, expected list"), + ], + ) + def test_sad_path(self, fn_name, args, expected_error, error_message): + """Test that invalid arguments raise the expected error type and message.""" + fn = BUILTIN_FNS[fn_name] + with pytest.raises(expected_error, match=error_message): + fn(*args) diff --git a/tests/contracts/internal/test_config_contract.py b/tests/contracts/internal/test_config_contract.py new file mode 100644 index 0000000..f791c89 --- /dev/null +++ b/tests/contracts/internal/test_config_contract.py @@ -0,0 +1,174 @@ 
+import os +from unittest.mock import Mock, patch + +import pytest + +from daggerml import Dml +from daggerml._internal.config import DmlConfig + + +def test_config_waterfall_defaults_env_explicit(): + cfg = DmlConfig.resolve( + defaults={"remote.project": "dml://alice/defaults"}, + env={"DML_REMOTE_PROJECT": "dml://alice/env"}, + explicit={"remote.project": "dml://alice/explicit"}, + ) + assert cfg.remote.project == "dml://alice/explicit" + + +def test_repo_env_resolution(): + cfg = DmlConfig.resolve( + env={"DML_PROJECT_HOME": "/repo/new"}, + ) + assert cfg.repo == "/repo/new" + + +def test_default_user_uses_env_user_and_hostname_shape(): + cfg = DmlConfig.resolve(env={"USER": "alice"}) + assert cfg.user is not None + assert cfg.user.startswith("alice") + assert "@" in cfg.user or cfg.user == "alice" + + +def test_path_values_expand_user(): + home = os.path.expanduser("~") + cfg = DmlConfig.resolve( + explicit={"project.home": "~/repo"}, + ) + assert cfg.repo == f"{home}/repo" + + +def test_dml_uses_config_resolution_from_env(monkeypatch): + monkeypatch.setenv("DML_PROJECT_HOME", "/tmp/from-env") + monkeypatch.setenv("DML_REMOTE_PROJECT", "dml://alice/demo") + dml = Dml() + assert dml._context.project_home == "/tmp/from-env" + assert dml._context.remote_root == dml._context.config.remote.root + + +def test_remote_config_from_canonical_env(): + cfg = DmlConfig.resolve( + env={ + "DML_REMOTE_ROOT": "s3://bucket/project", + } + ) + assert cfg.remote.root == "s3://bucket/project" + + +def test_remote_fetch_workers_defaults_to_16(): + cfg = DmlConfig.resolve(env={}) + assert cfg.remote.fetch_workers == 16 + + +def test_remote_fetch_workers_can_be_set_from_env(): + cfg = DmlConfig.resolve(env={"DML_REMOTE_FETCH_WORKERS": "24"}) + assert cfg.remote.fetch_workers == 24 + + +def test_remote_fetch_workers_rejects_invalid_values(): + with pytest.raises(ValueError, match="remote.fetch_workers must be a positive integer"): + DmlConfig.resolve(env={"DML_REMOTE_FETCH_WORKERS": 
"0"}) + + +def test_remote_project_stays_branchless_and_sets_db_path(): + cfg = DmlConfig.resolve( + explicit={ + "project.home": "/tmp/demo", + "remote.project": "dml://alice/demo", + }, + env={"DML_DEFAULT_BRANCH": "stable"}, + ) + assert cfg.remote.project == "dml://alice/demo" + assert cfg.db.path == "/tmp/demo/.dml/db" + + +def test_global_scope_omits_project_config(tmp_path): + dml_dir = tmp_path / ".dml" + dml_dir.mkdir() + (dml_dir / "config.toml").write_text( + """ +[remote] +project = "dml://alice/demo" +""".strip() + + "\n" + ) + cfg = DmlConfig.resolve(scope="global", explicit={"project.home": str(tmp_path)}) + assert cfg.remote.project is None + + +def test_resolution_precedence_global_project_env_explicit(tmp_path): + config_home = tmp_path / "cfg" + config_home.mkdir() + (config_home / "config.toml").write_text('[defaults]\nbranch = "global"\n') + project_dir = tmp_path / "repo" + (project_dir / ".dml").mkdir(parents=True) + (project_dir / ".dml" / "config.toml").write_text( + """ +[remote] +project = "dml://alice/demo" +""".strip() + + "\n" + ) + + cfg_from_project = DmlConfig.resolve( + explicit={"project.home": str(project_dir)}, + env={"DML_CONFIG_HOME": str(config_home)}, + ) + assert cfg_from_project.remote.project == "dml://alice/demo" + + cfg_from_env = DmlConfig.resolve( + explicit={"project.home": str(project_dir)}, + env={"DML_CONFIG_HOME": str(config_home), "DML_REMOTE_PROJECT": "dml://alice/env"}, + ) + assert cfg_from_env.remote.project == "dml://alice/env" + + cfg = DmlConfig.resolve( + explicit={"project.home": str(project_dir), "remote.project": "dml://alice/explicit"}, + env={"DML_CONFIG_HOME": str(config_home), "DML_REMOTE_PROJECT": "dml://alice/env"}, + ) + assert cfg.remote.project == "dml://alice/explicit" + + +def test_remote_project_rejects_tag_form(): + with pytest.raises(ValueError, match="must not include a branch or tag"): + DmlConfig.resolve(explicit={"remote.project": "dml://alice/demo@v1"}) + + +def 
def test_project_home_defaults_to_cwd_when_unset(tmp_path, monkeypatch):
    """With no ``DML_PROJECT_HOME`` and an empty env mapping, ``project.home`` is the cwd."""
    monkeypatch.delenv("DML_PROJECT_HOME", raising=False)
    # monkeypatch.chdir undoes the directory change at fixture teardown, which
    # replaces the manual os.getcwd()/os.chdir()/try/finally bookkeeping.
    monkeypatch.chdir(tmp_path)
    cfg = DmlConfig.resolve(env={})
    assert cfg.project.home == str(tmp_path)
{cls.__name__}.{method_name}" + + +def _assert_annotated_help(fn, parameter_names): + hints = get_type_hints(fn, include_extras=True) + for parameter_name in parameter_names: + annotation = hints[parameter_name] + assert get_origin(annotation) is Annotated, f"{fn.__qualname__}.{parameter_name} is not Annotated" + extras = get_args(annotation)[1:] + assert extras, f"{fn.__qualname__}.{parameter_name} is missing help metadata" + assert isinstance(extras[0], str) + assert extras[0] + + +def test_public_dml_classes_and_methods_have_docstrings(): + _assert_docstrings( + Dml, + [ + "__init__", + "status", + "branch", + "log", + "show", + "diff", + "checkout", + "fetch", + "pull", + "push", + "merge", + "revert", + "init", + ], + ) + _assert_docstrings(_ConfigNamespace, ["get", "set", "show"]) + _assert_docstrings( + _RuntimeNamespace, + [ + "create", + "get_node", + "get_argv", + "put_literal", + "put_import", + "set_node_name", + "start_fn", + "commit", + "list", + "describe", + "cancel", + ], + ) + _assert_docstrings( + _DagNamespace, + ["list", "describe", "get", "describe_node", "get_node", "unroll_node", "checkout", "delete"], + ) + _assert_docstrings(_AdminNamespace, ["gc"]) + _assert_docstrings(_AdminIndexNamespace, ["list", "get", "delete"]) + _assert_docstrings(_AdminCacheNamespace, ["invalidate"]) + _assert_docstrings(_AdminRemoteNamespace, ["list", "gc"]) + + +def test_public_dml_annotations_include_help_metadata(): + _assert_annotated_help(Dml.__init__, ["project_home", "remote_root", "user", "config_home"]) + _assert_annotated_help(Dml.branch, ["remote"]) + _assert_annotated_help(Dml.log, ["revision", "limit"]) + _assert_annotated_help(Dml.show, ["revision"]) + _assert_annotated_help(Dml.diff, ["left", "right"]) + _assert_annotated_help(Dml.checkout, ["revision"]) + _assert_annotated_help(Dml.fetch, ["remote_or_uri", "branch"]) + _assert_annotated_help(Dml.pull, ["remote_or_uri", "remote_branch", "branch", "user"]) + _assert_annotated_help(Dml.push, ["tag", 
"branch", "create", "force"]) + _assert_annotated_help(Dml.merge, ["revision", "branch", "user"]) + _assert_annotated_help(Dml.revert, ["revision", "branch", "user"]) + _assert_annotated_help( + Dml.init, + ["project_home", "remote_root", "user", "config_home", "remote_project"], + ) + + _assert_annotated_help(_ConfigNamespace.get, ["key", "scope"]) + _assert_annotated_help(_ConfigNamespace.set, ["key", "value", "scope"]) + _assert_annotated_help(_ConfigNamespace.show, ["contrib"]) + + _assert_annotated_help(_RuntimeNamespace.create, ["head", "commit", "argv_ptr", "index_id"]) + _assert_annotated_help(_RuntimeNamespace.get_node, ["index_id", "name"]) + _assert_annotated_help(_RuntimeNamespace.get_argv, ["index_id"]) + _assert_annotated_help(_RuntimeNamespace.put_literal, ["index_id", "value", "name"]) + _assert_annotated_help(_RuntimeNamespace.put_import, ["index_id", "dag", "node", "name"]) + _assert_annotated_help(_RuntimeNamespace.set_node_name, ["index_id", "name", "node"]) + _assert_annotated_help(_RuntimeNamespace.start_fn, ["index_id", "argv", "kwargv", "name"]) + _assert_annotated_help(_RuntimeNamespace.commit, ["index_id", "value", "head", "message", "dag_name"]) + _assert_annotated_help(_RuntimeNamespace.describe, ["index_id"]) + _assert_annotated_help(_RuntimeNamespace.cancel, ["index_id"]) + + _assert_annotated_help(_DagNamespace.list, ["revision"]) + _assert_annotated_help(_DagNamespace.describe, ["value", "revision"]) + _assert_annotated_help(_DagNamespace.get, ["value", "revision"]) + _assert_annotated_help(_DagNamespace.describe_node, ["node", "dag", "revision"]) + _assert_annotated_help(_DagNamespace.get_node, ["node", "dag", "revision"]) + _assert_annotated_help(_DagNamespace.unroll_node, ["node", "dag", "revision"]) + _assert_annotated_help(_DagNamespace.checkout, ["revision", "dag_name", "branch", "target_name", "replace", "user"]) + _assert_annotated_help(_DagNamespace.delete, ["name", "branch", "user"]) + + 
_assert_annotated_help(_AdminNamespace.gc, ["dry_run"]) + _assert_annotated_help(_AdminIndexNamespace.get, ["index_id"]) + _assert_annotated_help(_AdminIndexNamespace.delete, ["index_id"]) + _assert_annotated_help(_AdminCacheNamespace.invalidate, ["cache_keys"]) + _assert_annotated_help(_AdminRemoteNamespace.list, ["project", "owner"]) + _assert_annotated_help(_AdminRemoteNamespace.gc, ["min_age_seconds", "malformed"]) diff --git a/tests/contracts/internal/test_types_contract.py b/tests/contracts/internal/test_types_contract.py new file mode 100644 index 0000000..4ceeec4 --- /dev/null +++ b/tests/contracts/internal/test_types_contract.py @@ -0,0 +1,401 @@ +"""Comprehensive tests for types.py module with Hypothesis property-based testing.""" + +from collections import defaultdict + +import pytest +from hypothesis import assume, given, settings +from hypothesis import strategies as st + +from daggerml._internal.types import ( + DEFAULT_HEAD, + NAMESPACES, + NONE, + ArgvNode, + Collection, + Commit, + Dag, + Deletable, + DictDatum, + DmlBase, + DmlRepoError, + Error, + FnNode, + ImportNode, + KwargvNode, + ListDatum, + LiteralNode, + MaybeRefCollection, + MaybeRefScalar, + RefCollection, + RunnableDatum, + Scalar, + ScalarDatum, + Tree, + Uri, + _register_dml_obj, +) +from tests.contracts.internal.support.test_db_support import REF_ALPHABET, STR_ALPHABET, _refs + + +def _scalar_value_strategy(): + """Strategy for scalar values.""" + return st.one_of( + st.integers(min_value=-(2**63), max_value=2**63 - 1), + st.floats(allow_nan=False, allow_infinity=False), + st.booleans(), + st.text(), + st.none(), + ) + + +def _error_strategy(): + return st.builds( + Error, + message=st.text(alphabet=STR_ALPHABET, max_size=16), + origin=st.text(alphabet=STR_ALPHABET, max_size=16), + type=st.text(alphabet=STR_ALPHABET, max_size=16), + stack=st.lists( + st.dictionaries( + st.text(alphabet=REF_ALPHABET, max_size=8), + st.text(alphabet=REF_ALPHABET, max_size=8), + max_size=3, + ), + 
def _dag_strategy():
    """Hypothesis strategy producing ``Dag`` instances.

    A generated DAG has at most four node refs. When it has nodes, names map
    onto those nodes, the outcome is either nothing, one of the nodes (stored
    as ``result``) or an error ref (stored as ``error``), and ``argv`` is
    either ``None`` or one of the refs whose namespace is ``node-argv``. When
    it has no nodes, only an optional error ref is drawn.
    """

    @st.composite
    def _build(draw):
        node_refs = draw(st.lists(_node_ref, max_size=4))

        if not node_refs:
            # Nothing to name or point at: only the optional error varies.
            only_error = draw(st.one_of(st.none(), _refs("error")))
            return Dag(nodes=node_refs, names={}, result=None, argv=None, error=only_error)

        name_map = draw(
            st.dictionaries(
                st.text(alphabet=REF_ALPHABET, min_size=1, max_size=8),
                st.sampled_from(node_refs),
                max_size=4,
            )
        )

        # One draw decides between "no outcome", "node result" and "error".
        outcome = draw(st.one_of(st.none(), st.sampled_from(node_refs), _refs("error")))
        outcome_is_error = outcome is not None and outcome.ns() == "error"

        argv_candidates = [candidate for candidate in node_refs if candidate.ns() == "node-argv"]
        pick_argv = st.sampled_from(argv_candidates) if argv_candidates else st.none()
        chosen_argv = draw(st.one_of(st.none(), pick_argv))

        return Dag(
            nodes=node_refs,
            names=name_map,
            result=None if outcome_is_error else outcome,
            argv=chosen_argv,
            error=outcome if outcome_is_error else None,
        )

    return _build()
class TestDmlObjDecorator:
    """Test the dml_obj decorator functionality.

    Both tests register throwaway classes in the module-level ``NAMESPACES``
    registry, so each one snapshots the registry first and restores it
    afterwards to keep later tests from seeing (or being shadowed by) those
    temporary entries.
    """

    def test_register_dml_obj_registration(self):
        """Test that dml_obj decorator registers classes in NAMESPACES."""
        # NOTE(review): assumes NAMESPACES is a plain mutable mapping (dict-like);
        # confirm before relying on pop/clear/update here.
        saved = dict(NAMESPACES)
        try:
            # Start from a registry without "testclass" so the count check does
            # not depend on which other tests ran earlier in the process.
            NAMESPACES.pop("testclass", None)
            initial_namespaces = len(NAMESPACES)

            @_register_dml_obj
            class TestClass:
                pass

            assert "testclass" in NAMESPACES
            assert NAMESPACES["testclass"] is TestClass
            assert hasattr(TestClass, "_ns")
            assert TestClass._ns == "testclass"
            assert len(NAMESPACES) == initial_namespaces + 1
        finally:
            NAMESPACES.clear()
            NAMESPACES.update(saved)

    @given(st.text(alphabet="abcdefghijklmnopqrstuvwxyz", min_size=1, max_size=20))
    def test_register_dml_obj_lowercase_conversion(self, class_name):
        """Test decorator converts class names to lowercase for namespace."""
        # A generated name can coincide with a real namespace (e.g. "dag");
        # restoring the snapshot keeps the real registration intact.
        saved = dict(NAMESPACES)
        try:

            @_register_dml_obj
            class TempClass:
                pass

            # Temporarily set the class name
            TempClass.__name__ = class_name
            expected_ns = class_name.lower()

            # Re-register to test the name conversion
            obj = _register_dml_obj(TempClass)
            assert obj._ns == expected_ns
        finally:
            NAMESPACES.clear()
            NAMESPACES.update(saved)
node_ref): + """Test DAG nameof method with generated data.""" + assume(node_ref not in dag.names.values()) + reverse_map = defaultdict(list) + for name, ref in dag.names.items(): + reverse_map[ref].append(name) + for ref, names in reverse_map.items(): + assert dag.nameof(ref) in names + # Test with non-existent ref + if node_ref not in dag.names.values(): + assert dag.nameof(node_ref) is None + + @given(_uri_strategy()) + def test_deletable_from_uri(self, uri_datum): + """Test Deletable.from_uri creates deletable.""" + deletable = Deletable.from_uri(uri_datum) + assert deletable.uri == uri_datum.uri + assert isinstance(deletable, Deletable) + + @given(_dml_obj_strategy()) + def test_registered_type_roundtrips_via_to_dict_from_dict(self, obj): + """Test that all registered types can roundtrip through to_dict/from_dict.""" + obj_dict = obj.to_dict() + restored = type(obj).from_dict(obj_dict) + assert obj == restored + + @given(_dag_strategy().filter(lambda d: d.argv is None)) + def test_dag_cache_key_requires_argv(self, temp_bo, dag): + """Test that cache_key requires argv. + + Uses a real transaction context from the `temp_bo` fixture to exercise + `Dag.cache_key` with a real `TxnContext` instead of a casted mock. 
class TestConstants:
    """Test module constants and type aliases."""

    def test_constants_defined(self):
        """Test that required constants are defined."""
        assert NONE is not None
        assert DEFAULT_HEAD == "main"

    # Type alias definitions.
    # NOTE(review): this section was previously introduced by a bare string
    # literal in the class body, which is a no-op statement rather than a
    # docstring; it looks like the remnant of a separate test class header.
    def test_type_aliases_importable(self):
        """Test that type aliases can be imported and used."""
        # Just test that they're importable - type checking is done by mypy
        assert Scalar is not None
        assert MaybeRefScalar is not None
        assert Collection is not None
        assert MaybeRefCollection is not None
        assert RefCollection is not None
+ assert cls._ns == namespace diff --git a/tests/contracts/runtime/test_default_runtime_status.py b/tests/contracts/runtime/test_default_runtime_status.py new file mode 100644 index 0000000..622dc8c --- /dev/null +++ b/tests/contracts/runtime/test_default_runtime_status.py @@ -0,0 +1,87 @@ +from typing import cast + +import pytest + +import daggerml as dml +from daggerml.api import Dml +from tests import temporary_dml + + +@pytest.fixture(autouse=True) +def _reset_default_runtime(): + dml.clear_default_dml() + yield + dml.clear_default_dml() + + +def _default_info(status: dict) -> dict: + return status["default"] + + +def _repo_status(status: dict) -> dict: + return status["status"] + + +def test_default_runtime_status_DRT_STS_001_reports_implicit_default_creation_source(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + dml.clear_default_dml() + status = dml.status() + info = _default_info(status) + assert info["source"] == "implicit" + assert info["has_scoped_override"] is False + assert info["has_process_default"] is True + + repo = _repo_status(status) + assert set(repo.keys()) == {"head", "branches", "dags", "indexes"} + assert repo["head"] is None + assert repo["branches"] == [] + assert repo["dags"] == {} + assert repo["indexes"] == [] + + +def test_default_runtime_status_DRT_STS_002_get_default_dml_is_cached_process_default(): + dml.clear_default_dml() + dml0 = dml.get_default_dml() + dml1 = dml.get_default_dml() + assert dml0 is dml1 + + status = dml.status() + assert _default_info(status)["source"] == "process" + + +def test_default_runtime_status_DRT_STS_003_set_and_scoped_default_runtime_resolution(): + with temporary_dml(repo="a") as raw_a, temporary_dml(repo="b") as raw_b: + dml_a = cast(Dml, raw_a) + dml_b = cast(Dml, raw_b) + dml.set_default_dml(dml_a) + assert dml.get_default_dml() is dml_a + assert _default_info(dml.status())["source"] == "process" + + with dml.use_default_dml(dml_b): + assert dml.get_default_dml() is dml_b + scoped = 
_default_info(dml.status()) + assert scoped["source"] == "scoped" + assert scoped["has_scoped_override"] is True + + assert dml.get_default_dml() is dml_a + assert _default_info(dml.status())["source"] == "process" + + +def test_default_runtime_status_DRT_STS_004_top_level_new_and_load_delegate_to_default_runtime(): + with temporary_dml(repo="default-runtime") as raw_dml: + default_dml = cast(Dml, raw_dml) + dml.set_default_dml(default_dml) + with dml.new(dml=default_dml, name="d0", message="msg") as dag: + dag.put(42, name="n0") + dag.commit("ok") + + loaded = dml.load("d0", dml=default_dml) + assert loaded.result.value() == "ok" + assert loaded["n0"].value() == 42 + + +def test_temporary_runtime_uses_default_branch_for_active_head(): + with temporary_dml(repo="temp-head") as raw_runtime: + runtime = cast(Dml, raw_runtime) + assert runtime._context.project_home is not None + assert runtime.branch()["head"] == "main" diff --git a/tests/contrib/assets/docker_build_ctx/Dockerfile b/tests/contrib/assets/docker_build_ctx/Dockerfile new file mode 100644 index 0000000..f6c1c1f --- /dev/null +++ b/tests/contrib/assets/docker_build_ctx/Dockerfile @@ -0,0 +1,4 @@ +FROM alpine:3.20 +COPY run.sh /run.sh +RUN chmod +x /run.sh +CMD ["/run.sh"] diff --git a/tests/contrib/assets/docker_build_ctx/run.sh b/tests/contrib/assets/docker_build_ctx/run.sh new file mode 100644 index 0000000..e4eebc6 --- /dev/null +++ b/tests/contrib/assets/docker_build_ctx/run.sh @@ -0,0 +1,2 @@ +#!/bin/sh +echo docker-build-ok diff --git a/tests/contrib/assets/docker_executor_ctx/Dockerfile b/tests/contrib/assets/docker_executor_ctx/Dockerfile new file mode 100644 index 0000000..ea88991 --- /dev/null +++ b/tests/contrib/assets/docker_executor_ctx/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.14-slim + +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + SETUPTOOLS_SCM_PRETEND_VERSION=0.0 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends 
build-essential cmake \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /opt/daggerml + +COPY . /opt/daggerml + +RUN pip install --no-cache-dir boto3 pandas pyarrow /opt/daggerml diff --git a/tests/integration/.gitkeep b/tests/integration/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/contrib/test_funkify_integration.py b/tests/integration/contrib/test_funkify_integration.py new file mode 100644 index 0000000..adbc103 --- /dev/null +++ b/tests/integration/contrib/test_funkify_integration.py @@ -0,0 +1,413 @@ +from __future__ import annotations + +import os +import time +from dataclasses import dataclass +from typing import Any, cast + +import pytest + +from daggerml import clear_default_dml, load, new, set_default_dml +from daggerml._internal.dml import make_index_ops, with_db +from daggerml._internal.types import DmlRepoError, Runnable, Uri +from daggerml.codecs import CodecError +from daggerml.contrib import adapter_registry as areg +from daggerml.contrib import api +from daggerml.contrib import executor_registry as ereg +from daggerml.contrib.adapters import LocalAdapter +from daggerml.contrib.executors import ScriptExecutor +from daggerml.contrib.executors.script import run_payload +from daggerml.contrib.testing import defunkify +from tests import temporary_dml + +pytestmark = pytest.mark.slow + + +@pytest.fixture(autouse=True) +def _reset_registry(tmp_path, monkeypatch): + areg._reset_for_tests() + ereg._reset_for_tests() + monkeypatch.setenv("DML_TEST_FN_STATE_DIR", str(tmp_path / "state")) + areg.register_adapter(LocalAdapter) + + class InnerExecutor: + name = "inner" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @classmethod + def handle( + cls, + *, + cache_key, + execution_id, + state, + execution_status, + cancel_requested_by, + runnable, + argv_ptr, + remote, + ): + return {"status": 
"running", "error": None, "state": {"token": execution_id}} + + class CustomExecutor: + name = "custom" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @classmethod + def handle( + cls, + *, + cache_key, + execution_id, + state, + execution_status, + cancel_requested_by, + runnable, + argv_ptr, + remote, + ): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + ereg.register_executor(ScriptExecutor) + ereg.register_executor(InnerExecutor) + ereg.register_executor(CustomExecutor) + yield + areg._reset_for_tests() + ereg._reset_for_tests() + + +def _remote() -> dict[str, str]: + return {"root": os.environ["DML_REMOTE_ROOT"]} + + +def _mk_argv_ptr(*args: Any, argv0: Any | None = None) -> str: + with temporary_dml() as dml: + dag = new(dml=dml, name="argv-src", message="argv-src") + index_ref = dag._require_index_ref() + head = argv0 if argv0 is not None else Runnable(target=Uri("daggerml:list"), kwargs={}, adapter="") + with with_db(dml) as db: + index_ops = make_index_ops(db, dml) + fn_ref = index_ops.put_literal(index_ref, head) + arg_refs = [index_ops.put_literal(index_ref, value) for value in args] + with index_ops._tx(readonly=False) as txn: + argv_ref = index_ops._prepare_fn(index_ref, [fn_ref, *arg_refs], {}, txn) + return index_ops._remote_ops().put_ref_manifest(argv_ref) + + +def _poll_until_terminal( + *, runnable: Runnable, argv_ptr: str, cache_key: str, initial_state: dict[str, Any] | None = None +) -> dict[str, Any]: + execution_id = f"exec-{cache_key}" + state: dict[str, Any] | None = initial_state + for _ in range(200): + result = LocalAdapter.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=_remote(), + state=state, + execution_status=None, + cancel_requested_by=None, + ) + if state is None and result.get("status") == "running": + 
state = cast(dict[str, Any], result.get("state")) + if result["status"] in {"succeeded", "failed"}: + return cast(dict[str, Any], result) + time.sleep(0.01) + pytest.fail("script executor did not reach terminal state") + + +def test_funkify_decorator_returns_delayed_runnable(): + def fn_impl(dag, x=1): + return x + + fn = api.funkify(fn_impl, uri="script", adapter="local") + + assert isinstance(fn, api.DelayedRunnable) + assert fn.uri == "script" + assert fn.adapter == "local" + assert fn.sub is None + assert "fn" in fn.kwargs + assert defunkify(fn).__wrapped__ is fn_impl + + +def test_funkify_wrapper_returns_delayed_runnable(): + inner = api.DelayedRunnable(uri="inner", adapter="local", sub=None, kwargs={}) + wrapped = api.funkify(inner, uri="script", adapter="local", x=1) + assert isinstance(wrapped, api.DelayedRunnable) + assert wrapped.sub is inner + assert wrapped.kwargs == {"x": 1} + + +def test_funkify_wrapper_preserves_innermost_script_for_defunkify(): + def fn_impl(dag): + return 1 + + inner = api.funkify(fn_impl, uri="script", adapter="local") + + wrapped = api.funkify(inner, uri="custom", adapter="local", x=1) + + assert defunkify(wrapped).__wrapped__ is fn_impl + + +def test_funkify_invalid_input_fails(): + with pytest.raises(DmlRepoError, match="Invalid funkify input"): + api.funkify(cast(Any, 123), uri="script", adapter="local") + + +def test_funkify_with_ref_and_load_normalizes_via_codec(): + with temporary_dml() as dml: + src = new(dml=dml, name="src", message="src") + src.commit(9) + + dag = new(dml=dml, name="dst", message="dst") + dag.a = 7 + inner = api.DelayedRunnable(uri="inner", adapter="local", sub=None, kwargs={}) + delayed = api.funkify(inner, uri="custom", adapter="local", x=api.ref("a"), y=api.load("src")) + node = dag.put(cast(Any, delayed)) + rv = node.value() + assert isinstance(rv, Runnable) + assert rv.target.uri == "custom" + assert rv.adapter == "dml-local-adapter" + assert rv.kwargs["x"] == 7 + assert rv.kwargs["y"] == 9 + + 
+@pytest.mark.parametrize("resolved_adapter", ["podman-adapter", "/opt/acme/bin/acme-adapter"]) +def test_funkify_plugin_adapter_sugar_resolves_to_concrete_runtime_adapter(resolved_adapter): + @dataclass + class PluginAdapter: + name: str = "gpu" + executable: str = resolved_adapter + + def resolve_runnable(self, uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter=self.executable) + + @staticmethod + def send(runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def cli(argv=None): + return 0 + + areg.register_adapter(PluginAdapter()) + + with temporary_dml() as dml: + dag = new(dml=dml, name="gpu", message="gpu") + inner = api.DelayedRunnable(uri="inner", adapter="local", sub=None, kwargs={}) + delayed = api.funkify(inner, uri="custom", adapter="gpu") + rv = dag.put(cast(Any, delayed)).value() + + assert isinstance(rv, Runnable) + assert rv.target.uri == "custom" + assert rv.adapter == resolved_adapter + + +def test_funkify_script_runnable_contains_executable_fn_script(): + def helper(x): + return x + 1 + + def fn(dag, x, y=2): + return helper(x) + y + + with temporary_dml() as dml: + dag = new(dml=dml, name="dst", message="dst") + delayed = api.funkify(fn, uri="script", adapter="local", extra_objs=[helper]) + node = dag.put(cast(Any, delayed)) + rv = node.value() + assert isinstance(rv, Runnable) + assert "script" not in rv.kwargs + assert isinstance(rv.kwargs.get("__dml_script_exec__"), dict) + from daggerml.contrib.s3 import S3Store + + script = S3Store().get(rv.kwargs["__dml_script_exec__"]["script_uri"]).decode("utf-8") + + namespace: dict[str, Any] = {} + exec(script, namespace) + result = namespace["fn"](object(), 4, y=3) + assert result == 8 + + +@pytest.mark.parametrize( + "contract_id,stage,use_prepop,cache_key", + [ + pytest.param( + "FKY-LFC-001", "kickoff", False, 
"ck-funkify-int-1-kickoff", id="FKY-LFC-001:kickoff-returns-running" + ), + pytest.param( + "FKY-LFC-002", "resume", False, "ck-funkify-int-1-resume", id="FKY-LFC-002:resume-poll-returns-running" + ), + pytest.param( + "FKY-LFC-003", "terminal", False, "ck-funkify-int-1-terminal", id="FKY-LFC-003:terminal-succeeded" + ), + pytest.param( + "FKY-LFC-004", + "terminal", + True, + "ck-funkify-int-2", + id="FKY-LFC-004:terminal-succeeded-with-subchain-prepop", + ), + ], +) +def test_funkify_script_lifecycle_stage_matrix_FKY_LFC_001_to_FKY_LFC_004(contract_id, stage, use_prepop, cache_key): + del contract_id + decorate = api.funkify(uri="script", adapter="local") + nonce = {"kickoff": 1, "resume": 2, "terminal": 3}[stage] + + if use_prepop: + + @decorate + def fn(dag): + return dag.seed.value() * 2 + else: + + @decorate + def fn(dag, x, y=2, *, z=3, nonce=nonce): + return x.value() + y.value() + z.value() # pyright: ignore[reportAttributeAccessIssue] + + with temporary_dml() as dml: + dag_name = "dst-prepop" if use_prepop else "dst-int" + dag = new(dml=dml, name=dag_name, message=dag_name) + runnable = cast(Runnable, dag.put(cast(Any, fn)).value()) + + if use_prepop: + meta = cast(dict[str, Any], runnable.kwargs["__dml_script_exec__"]) + inner = Runnable( + target=Uri("inner"), + adapter="dml-local-adapter", + kwargs={ + "__dml_script_exec__": { + "prepop": {"seed": 6}, + "fn_name": cast(str, meta["fn_name"]), + "script_uri": cast(str, meta["script_uri"]), + } + }, + sub=None, + ) + outer = Runnable(target=Uri("outer"), adapter="dml-local-adapter", kwargs={}, sub=inner) + argv_ptr = _mk_argv_ptr(argv0=outer) + else: + argv_ptr = _mk_argv_ptr(4, argv0=runnable) + + kickoff = LocalAdapter.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=f"exec-{cache_key}", + remote=_remote(), + state=None, + execution_status=None, + cancel_requested_by=None, + ) + assert kickoff["status"] == "running" + + if stage == "kickoff": + return + + resumed = 
LocalAdapter.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=f"exec-{cache_key}", + remote=_remote(), + state=cast(dict[str, Any], kickoff["state"]), + execution_status=None, + cancel_requested_by=None, + ) + assert resumed["status"] == "running" + if stage == "resume": + return + + result = _poll_until_terminal( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + initial_state=cast(dict[str, Any], resumed["state"]), + ) + assert result["status"] == "succeeded", result + + +def test_dagclass_compiled_method_executes_through_local_script_runtime(): + @api.dagclass + class Example: + x: Any = 2 + + def main(self, a, b=1): + return self.x.value() + a.value() * b.value() # pyright: ignore[reportAttributeAccessIssue] + + with temporary_dml() as dml: + set_default_dml(dml) + try: + result = api.run(Example(), 4, b=3, name="dagclass-runtime-int") + assert result is None + + loaded = load("dagclass-runtime-int", dml=dml) + assert loaded["x"].value() == 2 + assert loaded[""].value() == 14 + assert loaded.result.value() == 14 + finally: + clear_default_dml() + + +def test_funkify_script_runtime_executes_generated_source_with_args_and_kwargs(tmp_path): + nonce = len(str(tmp_path)) + + def fn(dag, x, y=2, *, z=3, nonce=nonce): + return x.value() + y.value() + z.value() # pyright: ignore[reportAttributeAccessIssue] + + delayed = api.funkify(fn, uri="script", adapter="local") + with temporary_dml() as dml: + dag = new(dml=dml, name="dst-worker", message="dst-worker") + runnable = cast(Runnable, dag.put(cast(Any, delayed)).value()) + os.environ["DML_PROJECT_HOME"] = cast(str, dml._context.project_home) + try: + result = run_payload( + _mk_argv_ptr(4, argv0=runnable), + execution_id="exec-worker", + cache_key="ck-worker", + remote_root=_remote()["root"], + ) + finally: + os.environ.pop("DML_PROJECT_HOME", None) + + assert result["status"] == "succeeded" + assert result["error"] is None + assert isinstance(result["dag_id"], str) 
+ assert result["dag_id"] + + +def test_funkify_resolve_runnable_requires_runnable_return(): + @dataclass + class BadAdapter: + name: str = "bad" + executable: str = "bad-adapter" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return (uri, kwargs, sub) + + @staticmethod + def send(runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + @staticmethod + def cli(argv=None): + return 0 + + areg.register_adapter(BadAdapter()) + + with temporary_dml() as dml: + dag = new(dml=dml, name="d0", message="d0") + delayed = api.funkify(lambda dag: None, uri="script", adapter="bad") + with pytest.raises(CodecError, match="resolve_runnable must return Runnable"): + dag.put(cast(Any, delayed)) diff --git a/tests/integration/contrib/test_funks_integration.py b/tests/integration/contrib/test_funks_integration.py new file mode 100644 index 0000000..c67b93a --- /dev/null +++ b/tests/integration/contrib/test_funks_integration.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +import io +import json +import shutil +import subprocess +import tarfile +from pathlib import Path + +import pytest + +from daggerml import Uri, new +from daggerml.contrib import api +from daggerml.contrib.funks import docker_build +from daggerml.contrib.s3 import S3Store +from daggerml.contrib.testing import MockNode, defunkify +from tests import temporary_dml + +pytestmark = pytest.mark.slow + + +class FakeDag: + def put(self, value, *, name=None): + assert isinstance(value, Uri) + return MockNode(value) + + +ASSETS_DIR = Path(__file__).parents[2] / "contrib" / "assets" / "docker_build_ctx" + + +def _docker_available() -> bool: + if shutil.which("docker") is None: + return False + probe = subprocess.run(["docker", "info"], check=False, capture_output=True, text=True) + return probe.returncode == 0 + + +def _saved_image_tag(image_tarball: bytes) -> str: + with 
tarfile.open(fileobj=io.BytesIO(image_tarball), mode="r") as tf: + manifest = json.loads(tf.extractfile("manifest.json").read()) + return manifest[0]["RepoTags"][0] + + +def test_docker_build_is_funkified_script_callable(): + assert isinstance(docker_build, api.DelayedRunnable) + assert docker_build.uri == "script" + assert docker_build.adapter == "local" + assert defunkify(docker_build).__name__ == "docker_build" + + +def test_docker_build_builds_and_uploads_image_tar(monkeypatch): + calls: list[tuple[str, ...]] = [] + call = defunkify(docker_build) + + class FakeStore: + def untar(self, tar_uri, dest, *, unsafe=False): + assert tar_uri == Uri("s3://bucket/context.tar") + assert unsafe is False + + def put(self, data=None, filepath=None, *, suffix=""): + assert data is None + assert filepath is not None + assert suffix == ".tar" + return Uri("s3://bucket/image.tar") + + class FakeUuid: + hex = "abc123" + + monkeypatch.setattr("uuid.uuid4", lambda: FakeUuid()) + monkeypatch.setattr("daggerml.contrib.s3.S3Store", FakeStore) + monkeypatch.setattr("daggerml.contrib.funks._run", lambda *cmd: calls.append(cmd)) + + result = call(FakeDag(), Uri("s3://bucket/context.tar"), ["--platform=linux/amd64", "--no-cache"]) + + assert result == Uri("s3://bucket/image.tar") + assert calls[0][:4] == ("docker", "build", "--platform=linux/amd64", "--no-cache") + assert calls[0][-2:] == ("dml:abc123", calls[0][-1]) + assert calls[1][:3] == ("docker", "save", "-o") + assert calls[1][-1] == "dml:abc123" + + +def test_docker_build_pushes_when_repo_is_provided(monkeypatch): + calls: list[tuple[str, ...]] = [] + call = defunkify(docker_build) + + class FakeStore: + def untar(self, tar_uri, dest, *, unsafe=False): + return None + + def put(self, data=None, filepath=None, *, suffix=""): + return Uri("s3://bucket/image.tar") + + class FakeUuid: + hex = "abc123" + + monkeypatch.setattr("uuid.uuid4", lambda: FakeUuid()) + monkeypatch.setattr("daggerml.contrib.s3.S3Store", FakeStore) + 
monkeypatch.setattr("daggerml.contrib.funks._run", lambda *cmd: calls.append(cmd)) + + result = call(FakeDag(), Uri("s3://bucket/context.tar"), [], Uri("repo/name")) + + assert result == MockNode(Uri("repo/name:abc123")) + assert ("docker", "tag", "dml:abc123", "repo/name:abc123") in calls + assert ("docker", "push", "repo/name:abc123") in calls + + +def test_docker_build_in_dag_builds_runnable_image(tmp_path): + if not _docker_available(): + pytest.skip("docker daemon is not available") + + store = S3Store() + context_tarball = store.tar(ASSETS_DIR) + call = defunkify(docker_build) + + with temporary_dml() as dml: + with new(dml=dml, name="docker-build-int", message="docker-build-int") as dag: + image_tar_uri = call(dag, context_tarball) + + assert isinstance(image_tar_uri, Uri) + image_tarball = store.get(image_tar_uri) + image_tag = _saved_image_tag(image_tarball) + + subprocess.run(["docker", "image", "rm", "-f", image_tag], check=False, capture_output=True, text=True) + + tmp_tar = tmp_path / "docker-image.tar" + try: + tmp_tar.write_bytes(image_tarball) + load = subprocess.run(["docker", "load", "-i", str(tmp_tar)], check=True, capture_output=True, text=True) + assert image_tag in (load.stdout + load.stderr) + + run = subprocess.run(["docker", "run", "--rm", image_tag], check=True, capture_output=True, text=True) + assert run.stdout.strip() == "docker-build-ok" + finally: + tmp_tar.unlink(missing_ok=True) + subprocess.run(["docker", "image", "rm", "-f", image_tag], check=False, capture_output=True, text=True) diff --git a/tests/integration/contrib/test_local_runtime_integration.py b/tests/integration/contrib/test_local_runtime_integration.py new file mode 100644 index 0000000..d231855 --- /dev/null +++ b/tests/integration/contrib/test_local_runtime_integration.py @@ -0,0 +1,595 @@ +from __future__ import annotations + +import json +import os +import time +from pathlib import Path +from typing import Any, cast + +import pytest + +from daggerml._internal.types 
import DmlRepoError, Runnable, Uri +from daggerml.contrib import adapter_registry as areg +from daggerml.contrib import executor_registry as ereg +from daggerml.contrib.adapters import AdapterBase, LocalAdapter +from daggerml.contrib.executors import ScriptExecutor +from daggerml.contrib.s3 import S3Store + +pytestmark = pytest.mark.slow + + +@pytest.fixture(autouse=True) +def _reset_registries(tmp_path, monkeypatch): + areg._reset_for_tests() + ereg._reset_for_tests() + monkeypatch.setenv("DML_TEST_FN_STATE_DIR", str(tmp_path / "state")) + + class EchoExecutor: + name = "echo" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @classmethod + def handle( + cls, *, cache_key, execution_id, state, execution_status, cancel_requested_by, runnable, argv_ptr, remote + ): + return {"status": "succeeded", "error": None, "dag_id": "a" * 64} + + ereg.register_executor(EchoExecutor) + ereg.register_executor(ScriptExecutor) + yield + areg._reset_for_tests() + ereg._reset_for_tests() + + +def _remote() -> dict[str, str]: + return {"root": os.environ["DML_REMOTE_ROOT"]} + + +def _mk_argv_ptr(*args: Any, argv0: Any | None = None) -> str: + from daggerml import new + from daggerml._internal.dml import make_index_ops, with_db + from tests import temporary_dml + + with temporary_dml() as dml: + dag = new(dml=dml, name="argv-src", message="argv-src") + index_ref = dag._require_index_ref() + head = argv0 if argv0 is not None else Runnable(target=Uri("daggerml:list"), kwargs={}, adapter="") + with with_db(dml) as db: + index_ops = make_index_ops(db, dml) + fn_ref = index_ops.put_literal(index_ref, head) + arg_refs = [index_ops.put_literal(index_ref, value) for value in args] + with index_ops._tx(readonly=False) as txn: + argv_ref = index_ops._prepare_fn(index_ref, [fn_ref, *arg_refs], {}, txn) + return index_ops._remote_ops().put_ref_manifest(argv_ref) + + +def 
_poll_until_terminal( + *, runnable: Runnable, argv_ptr: str, cache_key: str, initial_state: dict[str, Any] | None = None +) -> dict[str, Any]: + execution_id = f"exec-{cache_key}" + state: dict[str, Any] | None = initial_state + for _ in range(200): + result = LocalAdapter.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=_remote(), + state=state, + execution_status=None, + cancel_requested_by=None, + ) + if state is None and result.get("status") == "running": + state = cast(dict[str, Any], result.get("state")) + if result["status"] in {"succeeded", "failed"}: + return cast(dict[str, Any], result) + time.sleep(0.01) + pytest.fail("script executor did not reach terminal state") + + +def _mk_script_runnable(script: str, *, fn_name: str = "fn", call_kwargs: dict[str, Any] | None = None) -> Runnable: + uri = S3Store().put(data=script.encode("utf-8"), suffix=".py") + return Runnable( + target=Uri("script"), + adapter="dml-local-adapter", + kwargs={ + "__dml_script_exec__": {"prepop": {}, "fn_name": fn_name, "script_uri": uri.uri}, + **dict(call_kwargs or {}), + }, + ) + + +def test_local_adapter_resolve_runnable_shape(): + result = LocalAdapter.resolve_runnable("echo", {"x": 1}, None) + assert isinstance(result, Runnable) + assert isinstance(result.target, Uri) + assert result.target.uri == "echo" + assert result.kwargs == {"x": 1} + assert result.adapter == "dml-local-adapter" + + +def test_local_adapter_script_resolve_runnable_derives_call_kwargs_and_script(): + def fn(dag, x, y=2, *, z=3): + return x + y + z + + result = LocalAdapter.resolve_runnable("script", {"fn": fn, "prepop": {}}, None) + assert isinstance(result, Runnable) + assert result.target.uri == "script" + meta = result.kwargs["__dml_script_exec__"] + assert isinstance(meta["script_uri"], str) + assert meta["script_uri"].startswith("s3://test-bucket/test-prefix/data/") + assert meta["script_uri"].endswith(".py") + assert result.kwargs["y"] == 2 
+ assert result.kwargs["z"] == 3 + assert meta["fn_name"] == "fn" + assert "script" not in result.kwargs + assert "required_positional_count" not in result.kwargs + + +def test_local_adapter_script_resolve_runnable_rejects_unknown_kwargs(): + def fn(dag): + return None + + with pytest.raises(DmlRepoError, match="Unknown script executor kwargs"): + LocalAdapter.resolve_runnable("script", {"fn": fn, "call_kwargs": {}}, None) + + +def test_local_adapter_script_resolve_runnable_rejects_no_dag_param(): + def fn(): + return None + + with pytest.raises(DmlRepoError, match="must include first 'dag' parameter"): + LocalAdapter.resolve_runnable("script", {"fn": fn}, None) + + +def test_local_adapter_script_resolve_runnable_requires_dag_as_first_param(): + def fn(x, dag): + return x + + with pytest.raises(DmlRepoError, match="must include first 'dag' parameter"): + LocalAdapter.resolve_runnable("script", {"fn": fn}, None) + + +def test_local_adapter_script_resolve_runnable_requires_global_fn_definition(): + def _mk_fn(): + return lambda dag: None + + fn = _mk_fn() + with pytest.raises(DmlRepoError, match="not globally defined"): + resp = LocalAdapter.resolve_runnable("script", {"fn": fn}, None) + assert resp.kwargs["script"] == "" # for better error visibility + + +def test_local_adapter_script_resolve_runnable_rejects_sub_runnable(): + def fn(dag): + return None + + sub = Runnable(target=Uri("inner"), adapter="dml-local-adapter", kwargs={}, sub=None) + with pytest.raises(DmlRepoError, match="does not accept sub runnable"): + LocalAdapter.resolve_runnable("script", {"fn": fn}, sub) + + +@pytest.mark.parametrize( + "script,call_kwargs,inject_prepop,resume_once,expected_terminal", + [ + pytest.param( + "\n".join(["def fn(dag, x, y=2):", " return x.value() + y.value()", ""]), + {"y": 2}, + False, + False, + "succeeded", + id="LRT-LFC-001:kickoff-running-then-terminal-success", + ), + pytest.param( + "\n".join(["def fn(dag):", " return 'seed' in dag.keys()", ""]), + None, + True, 
+ False, + "succeeded", + id="LRT-LFC-002:terminal-success-with-innermost-prepop", + ), + pytest.param( + "\n".join(["import time", "def fn(dag):", " time.sleep(0.2)", " return 1", ""]), + None, + False, + True, + "succeeded", + id="LRT-LFC-003:resume-poll-remains-running-before-terminal", + ), + pytest.param( + "\n".join(["def fn(dag):", " raise RuntimeError('boom')", ""]), + None, + False, + False, + "succeeded", + id="LRT-LFC-004:runtime-exception-path-terminal-envelope", + ), + ], +) +def test_script_executor_lifecycle_stage_matrix_LRT_LFC_001_to_LRT_LFC_004( + script, call_kwargs, inject_prepop, resume_once, expected_terminal, tmp_path +): + script = f"{script}\n# test-id:{tmp_path.name}\n" + runnable = _mk_script_runnable(script, call_kwargs=call_kwargs) + + if inject_prepop: + meta = cast(dict[str, Any], runnable.kwargs["__dml_script_exec__"]) + inner = Runnable( + target=Uri("inner"), + adapter="dml-local-adapter", + kwargs={ + "__dml_script_exec__": { + "prepop": {"seed": 9}, + "fn_name": cast(str, meta["fn_name"]), + "script_uri": cast(str, meta["script_uri"]), + } + }, + sub=None, + ) + outer = Runnable(target=Uri("outer"), adapter="dml-local-adapter", kwargs={}, sub=inner) + argv_ptr = _mk_argv_ptr(argv0=outer) + else: + argv_ptr = _mk_argv_ptr(3, argv0=runnable) if call_kwargs else _mk_argv_ptr(argv0=runnable) + + cache_key = ( + f"ck-stage-{expected_terminal}-" + f"{'resume' if resume_once else 'kickoff'}-" + f"{'prepop' if inject_prepop else 'plain'}" + ) + kickoff = LocalAdapter.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=f"exec-{cache_key}", + remote=_remote(), + state=None, + execution_status=None, + cancel_requested_by=None, + ) + assert kickoff["status"] == "running" + + if resume_once: + resumed = LocalAdapter.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=f"exec-{cache_key}", + remote=_remote(), + state=cast(dict[str, Any], kickoff["state"]), + 
execution_status=None, + cancel_requested_by=None, + ) + assert resumed["status"] == "running" + + result = _poll_until_terminal( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + initial_state=cast(dict[str, Any], resumed["state"]) if resume_once else cast(dict[str, Any], kickoff["state"]), + ) + assert result["status"] == expected_terminal + if expected_terminal == "succeeded": + assert result.get("error") is None + + +def test_script_executor_start_returns_running_with_job_state(): + script = "\n".join(["import time", "def fn(dag):", " time.sleep(0.2)", " return 1", ""]) + runnable = _mk_script_runnable(script) + cache_key = "ck-start-handle" + argv_ptr = _mk_argv_ptr(argv0=runnable) + remote = _remote() + + executor = ScriptExecutor() + result = executor.start( + cache_key=cache_key, + execution_id="exec-start-handle", + runnable=runnable, + argv_ptr=argv_ptr, + remote=remote, + ) + assert result["status"] == "running" + + job_state = cast(dict[str, Any], result["state"]) + assert isinstance(job_state.get("pid"), int) + assert isinstance(job_state.get("result_path"), str) + assert isinstance(job_state.get("stdout_path"), str) + assert isinstance(job_state.get("stderr_path"), str) + assert Path(cast(str, job_state["stdout_path"])).exists() + assert Path(cast(str, job_state["stderr_path"])).exists() + + from daggerml.contrib.executors.script import _cleanup_workdir + + _cleanup_workdir(job_state) + + +def test_script_executor_handles_worker_stdout_and_stderr(): + script = "\n".join( + [ + "import sys", + "def fn(dag):", + " sys.stdout.write('script-stdout\\n')", + " sys.stdout.flush()", + " sys.stderr.write('script-stderr\\n')", + " sys.stderr.flush()", + " return 1", + "", + ] + ) + runnable = _mk_script_runnable(script) + cache_key = "ck-mixed-worker-output" + argv_ptr = _mk_argv_ptr(argv0=runnable) + remote = _remote() + + executor = ScriptExecutor() + kickoff = executor.start( + cache_key=cache_key, + execution_id="exec-mixed-worker-output", 
+ runnable=runnable, + argv_ptr=argv_ptr, + remote=remote, + ) + assert kickoff["status"] == "running" + + state = cast(dict[str, Any], kickoff["state"]) + result = executor.poll(cache_key=cache_key, execution_id="exec-mixed-worker-output", state=state, remote=remote) + while result["status"] == "running": + time.sleep(0.01) + result = executor.poll(cache_key=cache_key, execution_id="exec-mixed-worker-output", state=state, remote=remote) + + assert result["status"] == "succeeded" + + +def test_local_adapter_resolve_runnable_rejects_executor_for_other_adapter(): + class ForeignExecutor: + name = "foreign" + adapter = "lambda" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri("foreign"), adapter="x", kwargs={}, sub=None) + + @classmethod + def handle( + cls, + *, + cache_key, + execution_id, + state, + execution_status, + cancel_requested_by, + runnable, + argv_ptr, + remote, + ): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + ereg.register_executor(ForeignExecutor) + with pytest.raises(DmlRepoError, match="is not registered for adapter 'local'"): + LocalAdapter.resolve_runnable("foreign", {}, None) + + +def test_local_adapter_dispatches_to_executor_lifecycle(): + seen: dict[str, Any] = {} + + class DispatchExecutorForTest: + name = "dispatch-test" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @classmethod + def handle( + cls, + *, + cache_key, + execution_id, + state, + execution_status, + cancel_requested_by, + runnable, + argv_ptr, + remote, + ): + seen["runnable"] = runnable + seen["argv_ptr"] = argv_ptr + seen["cache_key"] = cache_key + seen["execution_id"] = execution_id + seen["state"] = state + seen["remote"] = remote + return {"status": "succeeded", "error": None, "dag_id": "a" * 64} + + ereg.register_executor(DispatchExecutorForTest) + runnable = 
Runnable(target=Uri("dispatch-test"), adapter="dml-local-adapter", kwargs={}) + payload = LocalAdapter.send( + runnable=runnable, + argv_ptr=_mk_argv_ptr("a"), + cache_key="ck", + execution_id="exec-ck", + remote=_remote(), + state=None, + execution_status=None, + cancel_requested_by=None, + ) + payload = cast(dict[str, Any], payload) + assert payload == {"status": "succeeded", "error": None, "dag_id": "a" * 64} + assert isinstance(seen["argv_ptr"], str) + assert seen["cache_key"] == "ck" + + +def test_local_adapter_unknown_executor_fails_deterministically(): + runnable = Runnable(target=Uri("missing"), adapter="dml-local-adapter", kwargs={}) + with pytest.raises(DmlRepoError, match="Executor 'missing' is not registered for adapter 'local'"): + LocalAdapter.send( + runnable=runnable, + argv_ptr=_mk_argv_ptr(), + cache_key="ck", + execution_id="exec-ck", + remote=_remote(), + state=None, + execution_status=None, + cancel_requested_by=None, + ) + + +@pytest.mark.parametrize( + "contract_id,executor_name,executor_cls,argv_args,expected", + [ + pytest.param( + "LRT-ADP-001", + "echo", + None, + (1,), + {"status": "succeeded", "error": None, "dag_id": "a" * 64}, + id="LRT-ADP-001:kickoff-terminal-succeeded-passthrough", + ), + pytest.param( + "LRT-ADP-002", + "running", + "running", + (), + {"status": "running", "error": None, "state": {"token": "exec-ck"}}, + id="LRT-ADP-002:kickoff-running-passthrough", + ), + ], +) +def test_local_adapter_send_stage_matrix_LRT_ADP_001_to_LRT_ADP_002( + contract_id, executor_name, executor_cls, argv_args, expected +): + del contract_id + + if executor_cls == "running": + class RunningExecutor: + name = "running" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @classmethod + def handle( + cls, + *, + cache_key, + execution_id, + state, + execution_status, + cancel_requested_by, + runnable, + argv_ptr, + 
remote, + ): + return {"status": "running", "error": None, "state": {"token": execution_id}} + + ereg.register_executor(RunningExecutor) + + runnable = Runnable(target=Uri(executor_name), adapter="dml-local-adapter", kwargs={}) + result = LocalAdapter.send( + runnable=runnable, + argv_ptr=_mk_argv_ptr(*argv_args), + cache_key="ck", + execution_id="exec-ck", + remote=_remote(), + state=None, + execution_status=None, + cancel_requested_by=None, + ) + result = cast(dict[str, Any], result) + assert result == expected + + +def test_local_adapter_send_rejects_non_contract_payload(): + class BadExecutor: + name = "bad" + adapter = "local" + + @staticmethod + def resolve_runnable(uri, kwargs, sub): + return Runnable(target=Uri(uri), kwargs=dict(kwargs), sub=sub, adapter="dml-local-adapter") + + @classmethod + def handle( + cls, + *, + cache_key, + execution_id, + state, + execution_status, + cancel_requested_by, + runnable, + argv_ptr, + remote, + ): + return {"status": "running", "error": None, "state": {"token": execution_id}, "extra": 1} + + ereg.register_executor(BadExecutor) + runnable = Runnable(target=Uri("bad"), adapter="dml-local-adapter", kwargs={}) + with pytest.raises(DmlRepoError, match="Adapter output"): + LocalAdapter.send( + runnable=runnable, + argv_ptr=_mk_argv_ptr(), + cache_key="ck", + execution_id="exec-ck", + remote=_remote(), + state=None, + execution_status=None, + cancel_requested_by=None, + ) + + +def test_adapter_base_cli_reads_stdin_and_writes_stdout(capsys): + class DummyAdapter(AdapterBase): + @classmethod + def send( + cls, *, runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by + ): + return {"status": "succeeded", "error": None, "dag_id": "a" * 64} + + with pytest.MonkeyPatch.context() as mp: + mp.setattr( + "sys.stdin.read", + lambda: DummyAdapter._dump_payload( + runnable=Runnable(target=Uri("x"), adapter="dummy", kwargs={}), + argv_ptr="ptr", + cache_key="ck", + execution_id="exec-ck", + 
remote=_remote(), + state=None, + ).decode("utf-8"), + ) + exit_code = DummyAdapter.cli([]) + + assert exit_code == 0 + assert json.loads(capsys.readouterr().out.strip()) == {"status": "succeeded", "error": None, "dag_id": "a" * 64} + + +def test_adapter_base_cli_reads_and_writes_files(tmp_path): + class DummyAdapter(AdapterBase): + @classmethod + def send( + cls, *, runnable, argv_ptr, cache_key, execution_id, remote, state, execution_status, cancel_requested_by + ): + return {"status": "succeeded", "error": None, "dag_id": "a" * 64} + + in_file = tmp_path / "in.json" + out_file = tmp_path / "out.json" + in_file.write_bytes( + DummyAdapter._dump_payload( + runnable=Runnable(target=Uri("x"), adapter="dummy", kwargs={}), + argv_ptr="ptr", + cache_key="ck", + execution_id="exec-ck", + remote=_remote(), + state=None, + ) + ) + + exit_code = DummyAdapter.cli(["-i", str(in_file), "-o", str(out_file)]) + assert exit_code == 0 + assert json.loads(out_file.read_text()) == {"status": "succeeded", "error": None, "dag_id": "a" * 64} diff --git a/tests/integration/contrib/test_s3_store_integration.py b/tests/integration/contrib/test_s3_store_integration.py new file mode 100644 index 0000000..7ddb6d9 --- /dev/null +++ b/tests/integration/contrib/test_s3_store_integration.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import io +import stat +import sys +import tarfile +from pathlib import Path + +import pytest + +from daggerml import Uri +from daggerml._internal.types import DmlRepoError +from daggerml.contrib.s3 import S3Store, is_s3_uri + +pytestmark = pytest.mark.slow + + +def test_s3_store_default_uses_remote_root_data_prefix(): + store = S3Store() + uri = store.put(data=b"hello") + assert isinstance(uri, Uri) + assert uri.uri.startswith("s3://test-bucket/test-prefix/data/") + + +def test_s3_store_parse_uri_and_name_to_uri(): + store = S3Store(bucket="test-bucket", prefix="base") + assert store.parse_uri("s3://other/key") == ("other", "key") + assert 
store.parse_uri("x") == ("test-bucket", "base/x") + assert store._name2uri("x") == Uri("s3://test-bucket/base/x") + + +def test_s3_store_put_get_exists_ls_rm_roundtrip(): + store = S3Store() + uri = store.put(data=b"abc", suffix=".txt") + assert isinstance(uri, Uri) + assert store.exists(uri) is True + assert store.get(uri) == b"abc" + listed = store.ls(recursive=True) + assert uri in listed + store.rm(uri) + assert store.exists(uri) is False + + +def test_s3_store_put_js_get_js_roundtrip(): + store = S3Store() + uri = store.put_js({"b": 2, "a": 1}) + assert isinstance(uri, Uri) + assert uri.uri.endswith(".json") + assert store.get_js(uri) == {"a": 1, "b": 2} + + +def test_s3_store_tar_and_untar(tmp_path): + src = tmp_path / "src" + src.mkdir() + (src / "a.txt").write_text("A") + (src / "run.sh").write_text("#!/bin/sh\necho hi\n") + (src / "b.tmp").write_text("TMP") + (src / "run.sh").chmod(0o755) + + store = S3Store() + tar_uri = store.tar(src, excludes=["*.tmp"]) + out = tmp_path / "out" + store.untar(tar_uri, out) + + assert (out / "a.txt").read_text() == "A" + assert not (out / "b.tmp").exists() + src_mode = (src / "run.sh").stat().st_mode + out_mode = (out / "run.sh").stat().st_mode + assert bool(src_mode & stat.S_IXUSR) == bool(out_mode & stat.S_IXUSR) + assert bool(src_mode & stat.S_IXGRP) == bool(out_mode & stat.S_IXGRP) + assert bool(src_mode & stat.S_IXOTH) == bool(out_mode & stat.S_IXOTH) + + +def test_s3_store_tar_excludes_directory_descendants(tmp_path): + src = tmp_path / "src" + (src / ".venv" / "bin").mkdir(parents=True) + (src / "keep.txt").write_text("keep") + (src / ".venv" / "bin" / "python").write_text("skip") + + store = S3Store() + tar_uri = store.tar(src, excludes=[".venv"]) + out = tmp_path / "out" + store.untar(tar_uri, out) + + assert (out / "keep.txt").read_text() == "keep" + assert not (out / ".venv").exists() + + +def test_s3_store_tar_skips_absolute_symlinks_under_excluded_directory(tmp_path): + src = tmp_path / "src" + (src / ".venv" 
/ "bin").mkdir(parents=True) + (src / "keep.txt").write_text("keep") + (src / ".venv" / "bin" / "python").symlink_to(Path(sys.executable)) + + store = S3Store() + tar_uri = store.tar(src, excludes=[".venv"]) + out = tmp_path / "out" + store.untar(tar_uri, out) + + assert (out / "keep.txt").read_text() == "keep" + assert not (out / ".venv").exists() + + +def test_s3_store_tar_raises_on_non_excluded_symlink_by_default(tmp_path): + src = tmp_path / "src" + src.mkdir() + (src / "keep.txt").write_text("keep") + (src / "link.txt").symlink_to(src / "keep.txt") + + store = S3Store() + with pytest.raises(DmlRepoError, match="symlinks='raise'"): + store.tar(src) + + +def test_s3_store_tar_ignores_non_excluded_symlink_when_requested(tmp_path): + src = tmp_path / "src" + src.mkdir() + (src / "keep.txt").write_text("keep") + (src / "link.txt").symlink_to(src / "keep.txt") + + store = S3Store() + tar_uri = store.tar(src, symlinks="ignore") + out = tmp_path / "out" + store.untar(tar_uri, out) + + assert (out / "keep.txt").read_text() == "keep" + assert not (out / "link.txt").exists() + + +def test_s3_store_cd_rebases_prefix(): + store = S3Store(bucket="test-bucket", prefix="a/b") + next_store = store.cd("c") + assert next_store.prefix.endswith("a/b/c") + + +def test_is_s3_uri_validation_matrix(): + assert is_s3_uri("s3://bucket/key") is True + assert is_s3_uri("s3://bucket/dir/key.py") is True + assert is_s3_uri("s3://bucket") is False + assert is_s3_uri("https://bucket/key") is False + assert is_s3_uri("") is False + + +def test_s3_store_requires_remote_root_or_explicit_bucket(monkeypatch, tmp_path): + monkeypatch.chdir(tmp_path) + monkeypatch.delenv("DML_REMOTE_ROOT", raising=False) + with pytest.raises(DmlRepoError, match="requires configured remote.root"): + S3Store() + + +def test_s3_store_tar_is_reproducible_for_tests_assets(): + store = S3Store() + assets_dir = Path(__file__).resolve().parents[2] / "assets" + first = store.tar(assets_dir) + second = store.tar(assets_dir) + 
assert first.uri == second.uri + + +def test_s3_store_untar_rejects_path_traversal_by_default(tmp_path): + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w") as tf: + payload = b"owned" + info = tarfile.TarInfo(name="../escape.txt") + info.size = len(payload) + tf.addfile(info, io.BytesIO(payload)) + + store = S3Store() + tar_uri = store.put(data=buf.getvalue(), suffix=".tar") + out = tmp_path / "out" + with pytest.raises(DmlRepoError, match="outside destination"): + store.untar(tar_uri, out) + assert not (tmp_path / "escape.txt").exists() + + +def test_s3_store_untar_allows_unsafe_extract_when_explicit(tmp_path): + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w") as tf: + payload = b"owned" + info = tarfile.TarInfo(name="../escape.txt") + info.size = len(payload) + tf.addfile(info, io.BytesIO(payload)) + + store = S3Store() + tar_uri = store.put(data=buf.getvalue(), suffix=".tar") + out = tmp_path / "out" + store.untar(tar_uri, out, unsafe=True) + assert (tmp_path / "escape.txt").read_bytes() == b"owned" diff --git a/tests/integration/contrib/test_ssh_integration.py b/tests/integration/contrib/test_ssh_integration.py new file mode 100644 index 0000000..8889a71 --- /dev/null +++ b/tests/integration/contrib/test_ssh_integration.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +import getpass +import logging +import os +import shlex +import shutil +import socket +import subprocess +import sys +import time +from pathlib import Path +from tempfile import TemporaryDirectory +from textwrap import dedent +from typing import Any, cast + +import pytest + +from daggerml import Uri, new +from daggerml._internal.dml import make_index_ops, with_db +from daggerml._internal.types import Runnable +from daggerml.contrib import adapter_registry as areg +from daggerml.contrib import api +from daggerml.contrib import executor_registry as ereg +from daggerml.contrib.adapters import LocalAdapter +from daggerml.contrib.executors import ScriptExecutor, 
SshExecutor +from tests import temporary_dml + +pytestmark = pytest.mark.slow + +logger = logging.getLogger(__name__) + + +def _require_ssh_tools() -> None: + missing = [name for name in ("ssh", "sshd", "ssh-keygen") if shutil.which(name) is None] + if missing: + pytest.skip(f"missing ssh tools: {', '.join(missing)}") + + +@pytest.fixture(autouse=True) +def _reset_registries(tmp_path, monkeypatch): + areg._reset_for_tests() + ereg._reset_for_tests() + monkeypatch.setenv("DML_TEST_FN_STATE_DIR", str(tmp_path / "state")) + areg.register_adapter(LocalAdapter) + ereg.register_executor(ScriptExecutor) + ereg.register_executor(SshExecutor) + yield + areg._reset_for_tests() + ereg._reset_for_tests() + + +@pytest.fixture +def local_sshd(): + _require_ssh_tools() + sshd_proc = None + with TemporaryDirectory(prefix="daggerml-ssh-test-") as tmpd: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + + host_key_path = Path(tmpd) / "ssh_host_ed25519_key" + client_key_path = Path(tmpd) / "client_ed25519_key" + authorized_keys_path = Path(tmpd) / "authorized_keys" + sshd_config_path = Path(tmpd) / "sshd_config" + pid_file = Path(tmpd) / "sshd.pid" + + subprocess.run(["ssh-keygen", "-q", "-t", "ed25519", "-N", "", "-f", str(host_key_path)], check=True) + subprocess.run(["ssh-keygen", "-q", "-t", "ed25519", "-N", "", "-f", str(client_key_path)], check=True) + + shutil.copyfile(client_key_path.with_suffix(".pub"), authorized_keys_path) + authorized_keys_path.chmod(0o600) + + sshd_config_path.write_text( + dedent( + f""" + Port {port} + ListenAddress 127.0.0.1 + HostKey {host_key_path} + PidFile {pid_file} + LogLevel VERBOSE + StrictModes no + PasswordAuthentication no + KbdInteractiveAuthentication no + ChallengeResponseAuthentication no + PubkeyAuthentication yes + AuthorizedKeysFile {authorized_keys_path} + UsePAM no + PermitRootLogin no + """ + ).strip() + + "\n" + ) + + sshd_path = 
shutil.which("sshd") + assert sshd_path is not None + sshd_proc = subprocess.Popen( + [sshd_path, "-D", "-e", "-f", str(sshd_config_path)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + flags = [ + "-i", + str(client_key_path), + "-p", + str(port), + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "-o", + "IdentitiesOnly=yes", + ] + logger.debug( + "starting local sshd tmpdir=%s port=%s host_key=%s client_key=%s", + tmpd, + port, + host_key_path, + client_key_path, + ) + + deadline = time.time() + 5.0 + while time.time() < deadline: + if sshd_proc.poll() is not None: + stdout, stderr = sshd_proc.communicate(timeout=1) + pytest.skip( + "local sshd failed to start:\n" + f"stdout: {stdout.decode(errors='replace')}\n" + f"stderr: {stderr.decode(errors='replace')}" + ) + try: + with socket.create_connection(("127.0.0.1", port), timeout=0.25): + break + except OSError: + time.sleep(0.1) + else: + sshd_proc.terminate() + pytest.skip("timeout waiting for local sshd to start") + + try: + logger.debug("local sshd ready host=%s flags=%s", f"{getpass.getuser()}@127.0.0.1", flags) + yield flags, f"{getpass.getuser()}@127.0.0.1" + finally: + sshd_proc.terminate() + try: + sshd_proc.wait(timeout=5) + except subprocess.TimeoutExpired: + sshd_proc.kill() + logger.debug("local sshd stopped") + + +@pytest.fixture +def ssh_resource_data(local_sshd, tmp_path): + flags, host = local_sshd + env_file = tmp_path / "ssh.env" + remote_state_dir = tmp_path / "remote-state" + remote_state_dir.mkdir() + aws_exports = "\n".join( + f"export {name}={shlex.quote(value)}" for name, value in sorted(os.environ.items()) if name.startswith("AWS_") + ) + # required for gh actions sanitize task + sanitizer_exports = "\n".join( + f"export {name}={shlex.quote(os.environ[name])}" + for name in ("LD_PRELOAD", "ASAN_OPTIONS", "UBSAN_OPTIONS") + if name in os.environ + ) + env_file.write_text( + dedent( + f""" + export 
DML_TEST_FN_STATE_DIR={shlex.quote(str(remote_state_dir))} + export PATH={shlex.quote(str(Path(sys.executable).parent))}:$PATH + export DML_TEST_SSH_VALUE=ssh-ok + {aws_exports} + {sanitizer_exports} + """ + ).strip() + + "\n" + ) + return {"host": host, "flags": flags, "env_files": [str(env_file)]} + + +def _remote() -> dict[str, str]: + return {"root": os.environ["DML_REMOTE_ROOT"]} + + +def _mk_argv_ptr(*args: Any, argv0: Any | None = None) -> str: + with temporary_dml() as dml: + dag = new(dml=dml, name="argv-src", message="argv-src") + index_ref = dag._require_index_ref() + head = argv0 if argv0 is not None else Runnable(target=Uri("daggerml:list"), kwargs={}, adapter="") + with with_db(dml) as db: + index_ops = make_index_ops(db, dml) + fn_ref = index_ops.put_literal(index_ref, head) + arg_refs = [index_ops.put_literal(index_ref, value) for value in args] + with index_ops._tx(readonly=False) as txn: + argv_ref = index_ops._prepare_fn(index_ref, [fn_ref, *arg_refs], {}, txn) + return index_ops._remote_ops().put_ref_manifest(argv_ref) + + +def _poll_until_terminal(*, runnable: Runnable, argv_ptr: str, cache_key: str) -> dict[str, Any]: + execution_id = f"exec-{cache_key}" + state: dict[str, Any] | None = None + for _ in range(200): + result = LocalAdapter.send( + runnable=runnable, + argv_ptr=argv_ptr, + cache_key=cache_key, + execution_id=execution_id, + remote=_remote(), + state=state, + ) + logger.debug( + "ssh integration poll cache_key=%s execution_id=%s status=%s error=%r state=%r", + cache_key, + execution_id, + result.get("status"), + result.get("error"), + result.get("state"), + ) + if state is None and result.get("status") == "running": + state = cast(dict[str, Any], result.get("state")) + if result["status"] in {"succeeded", "failed"}: + return cast(dict[str, Any], result) + time.sleep(0.05) + pytest.fail("ssh executor did not reach terminal state") + + +def test_ssh_executor_integration_runs_script_over_local_sshd(ssh_resource_data): + decorate = 
api.funkify(uri="ssh", adapter="local", **ssh_resource_data) + + @decorate + @api.funkify(uri="script", adapter="local") + def fn(dag): + import os + + return os.environ["DML_TEST_SSH_VALUE"] + + with temporary_dml() as dml: + dag = new(dml=dml, name="ssh-int", message="ssh-int") + runnable = cast(Runnable, dag.put(cast(Any, fn)).value()) + + argv_ptr = _mk_argv_ptr(argv0=runnable) + result = _poll_until_terminal(runnable=runnable, argv_ptr=argv_ptr, cache_key="ck-ssh-int-success") + assert result["status"] == "succeeded" + assert result["error"] is None + assert isinstance(result.get("dag_id"), str) diff --git a/tests/integration/contrib/test_supervisor_integration.py b/tests/integration/contrib/test_supervisor_integration.py new file mode 100644 index 0000000..e8a6446 --- /dev/null +++ b/tests/integration/contrib/test_supervisor_integration.py @@ -0,0 +1,338 @@ +from __future__ import annotations + +import json +import os +import sys +import tempfile +from pathlib import Path +from typing import Any +from uuid import uuid4 + +import boto3 +import pytest + +from daggerml._internal.types import DmlRepoError +from daggerml.contrib.supervisor import ( + _CLOUDWATCH_EVENT_OVERHEAD_BYTES, + _CLOUDWATCH_MAX_BATCH_BYTES, + _CLOUDWATCH_MAX_MESSAGE_BYTES, + _CloudWatchStream, + _parse_cmd_payload, + _validate_output, + run, +) + +pytestmark = pytest.mark.slow + +REAL_DAG_ID = "d" * 64 + + +def _cmd_payload(cmd: Any) -> dict[str, Any]: + return { + "version": 0, + "cache_key": f"cache:key:{uuid4()}", + "execution_id": uuid4().hex, + "cmd": cmd, + "remote": {"root": "s3://bucket/root"}, + } + + +def _log_messages(*, cache_key: str, stream_kind: str) -> list[str]: + client = boto3.client("logs", endpoint_url=os.environ["AWS_ENDPOINT_URL"]) + response = client.get_log_events(logGroupName="dml", logStreamName=f"/run/{cache_key}/{stream_kind}") + return [event["message"] for event in response["events"]] + + +@pytest.mark.parametrize("bad_cmd", [None, [], [1], [""], "python -m 
mod"]) # type: ignore[list-item] +def test_supervisor_launch_rejects_malformed_cmd(bad_cmd): + with pytest.raises(DmlRepoError, match=r"cmd must be a non-empty list\[str\]"): + _parse_cmd_payload(_cmd_payload(bad_cmd)) + + +def test_supervisor_payload_rejects_unknown_top_level_fields(): + payload = _cmd_payload([sys.executable, "-c", "pass"]) + payload["extra"] = "nope" + with pytest.raises(DmlRepoError, match=r"unknown fields: extra"): + _parse_cmd_payload(payload) + + +def test_supervisor_payload_rejects_unknown_remote_fields(): + payload = _cmd_payload([sys.executable, "-c", "pass"]) + payload["remote"] = {"root": "s3://bucket/root", "cache": "cache-ns"} + with pytest.raises(DmlRepoError, match=r"remote has unknown fields: cache"): + _parse_cmd_payload(payload) + + +def test_supervisor_validate_output_requires_dag_id_on_success(): + with pytest.raises(DmlRepoError, match=r"dag_id"): + _validate_output({"status": "succeeded", "error": None}) + + +def test_supervisor_validate_output_rejects_running_only_status_after_worker_exit(): + with pytest.raises(DmlRepoError, match=r"succeeded\|failed"): + _validate_output({"status": "running", "error": None, "state": {}}) + + +def test_supervisor_validate_output_rejects_canceled_status(): + with pytest.raises(DmlRepoError, match=r"succeeded\|failed"): + _validate_output({"status": "canceled", "error": None}) + + +def test_supervisor_run_succeeds_when_worker_writes_result(): + result_json = json.dumps({"status": "succeeded", "error": None, "dag_id": REAL_DAG_ID}) + script = ( + f"import pathlib; pathlib.Path('result.json').write_text({result_json!r})" + ) + payload = { + "version": 0, + "cache_key": f"cache:key:{uuid4()}", + "execution_id": uuid4().hex, + "cmd": [sys.executable, "-c", script], + "remote": {"root": "s3://bucket/root"}, + } + result = run(payload) + assert result == {"status": "succeeded", "error": None, "dag_id": REAL_DAG_ID} + + +def test_supervisor_run_returns_failed_when_worker_exits_without_result(): + 
payload = { + "version": 0, + "cache_key": f"cache:key:{uuid4()}", + "execution_id": uuid4().hex, + "cmd": [sys.executable, "-c", "pass"], + "remote": {"root": "s3://bucket/root"}, + } + result = run(payload) + assert result["status"] == "failed" + assert "result" in result["error"].lower() or "code" in result["error"].lower() + + +def test_supervisor_run_returns_failed_when_worker_writes_running_status(): + script = "import pathlib; pathlib.Path('result.json').write_text('{\"status\":\"running\",\"error\":null}')" + payload = { + "version": 0, + "cache_key": f"cache:key:{uuid4()}", + "execution_id": uuid4().hex, + "cmd": [sys.executable, "-c", script], + "remote": {"root": "s3://bucket/root"}, + } + result = run(payload) + assert result["status"] == "failed" + assert "succeeded|failed" in result["error"] + + +def test_supervisor_run_returns_failed_when_worker_crashes(): + payload = { + "version": 0, + "cache_key": f"cache:key:{uuid4()}", + "execution_id": uuid4().hex, + "cmd": [sys.executable, "-c", "raise RuntimeError('boom')"], + "remote": {"root": "s3://bucket/root"}, + } + result = run(payload) + assert result["status"] == "failed" + + +def test_supervisor_run_returns_failed_result_from_worker(): + script = ( + "import pathlib; " + "pathlib.Path('result.json').write_text(" + "'{\"status\":\"failed\",\"error\":\"worker error\"}'" + ")" + ) + payload = { + "version": 0, + "cache_key": f"cache:key:{uuid4()}", + "execution_id": uuid4().hex, + "cmd": [sys.executable, "-c", script], + "remote": {"root": "s3://bucket/root"}, + } + result = run(payload) + assert result == {"status": "failed", "error": "worker error"} + + +def test_supervisor_streams_stdout_and_stderr_to_cloudwatch_and_local_files(monkeypatch): + seen_paths: dict[str, str] = {} + real_mkdtemp = tempfile.mkdtemp + + def capture_mkdtemp(*args, **kwargs): + path = real_mkdtemp(*args, **kwargs) + seen_paths["workdir"] = path + return path + + 
monkeypatch.setattr("daggerml.contrib.supervisor.tempfile.mkdtemp", capture_mkdtemp) + cache_key = f"cache:key:{uuid4()}" + script = "\n".join( + [ + "import pathlib, sys, time", + "sys.stdout.write('stdout-line\\n')", + "sys.stdout.flush()", + "time.sleep(0.05)", + "sys.stderr.write('stderr-line\\n')", + "sys.stderr.flush()", + "pathlib.Path('result.json').write_text(" + f"{json.dumps({'status': 'succeeded', 'error': None, 'dag_id': REAL_DAG_ID})!r}" + ")", + "", + ] + ) + payload = { + "version": 0, + "cache_key": cache_key, + "execution_id": uuid4().hex, + "cmd": [sys.executable, "-c", script], + "remote": {"root": "s3://bucket/root"}, + } + + result = run(payload) + + assert result == {"status": "succeeded", "error": None, "dag_id": REAL_DAG_ID} + workdir = Path(seen_paths["workdir"]) + assert workdir.joinpath("stdout.log").read_text() == "stdout-line\n" + assert workdir.joinpath("stderr.log").read_text() == "stderr-line\n" + + stdout_messages = _log_messages(cache_key=cache_key, stream_kind="stdout") + stderr_messages = _log_messages(cache_key=cache_key, stream_kind="stderr") + assert json.loads(stdout_messages[0]) == { + "cache_key": cache_key, + "event": "stream_start", + "execution_id": payload["execution_id"], + "stream": "stdout", + } + assert stdout_messages[1] == "stdout-line\n" + assert json.loads(stdout_messages[-1]) == { + "cache_key": cache_key, + "event": "stream_end", + "execution_id": payload["execution_id"], + "stream": "stdout", + "terminal_status": "succeeded", + } + assert stderr_messages[1] == "stderr-line\n" + assert json.loads(stderr_messages[-1]) == { + "cache_key": cache_key, + "event": "stream_end", + "execution_id": payload["execution_id"], + "stream": "stderr", + "terminal_status": "succeeded", + } + + +def test_supervisor_cloudwatch_delivery_failure_is_non_fatal(monkeypatch, caplog): + class FailingLogsClient: + def create_log_group(self, **kwargs): + return None + + def create_log_stream(self, **kwargs): + return None + + def 
put_log_events(self, **kwargs): + raise RuntimeError("cw down") + + monkeypatch.setattr("daggerml.contrib.supervisor._create_logs_client", lambda: FailingLogsClient()) + result_json = json.dumps({"status": "succeeded", "error": None, "dag_id": REAL_DAG_ID}) + script = ( + "import pathlib,sys; sys.stdout.write('hello\\n'); sys.stdout.flush(); " + f"pathlib.Path('result.json').write_text({result_json!r})" + ) + payload = { + "version": 0, + "cache_key": f"cache:key:{uuid4()}", + "execution_id": uuid4().hex, + "cmd": [sys.executable, "-c", script], + "remote": {"root": "s3://bucket/root"}, + } + + with caplog.at_level("WARNING"): + result = run(payload) + + assert result == {"status": "succeeded", "error": None, "dag_id": REAL_DAG_ID} + assert any("CloudWatch logging disabled" in message for message in caplog.messages) + + +def test_supervisor_cloudwatch_initialization_failure_is_non_fatal(monkeypatch, caplog): + monkeypatch.setattr( + "daggerml.contrib.supervisor._create_logs_client", + lambda: (_ for _ in ()).throw(RuntimeError("no cw")), + ) + result_json = json.dumps({"status": "succeeded", "error": None, "dag_id": REAL_DAG_ID}) + script = ( + "import pathlib,sys; sys.stderr.write('hello\\n'); sys.stderr.flush(); " + f"pathlib.Path('result.json').write_text({result_json!r})" + ) + payload = { + "version": 0, + "cache_key": f"cache:key:{uuid4()}", + "execution_id": uuid4().hex, + "cmd": [sys.executable, "-c", script], + "remote": {"root": "s3://bucket/root"}, + } + + with caplog.at_level("WARNING"): + result = run(payload) + + assert result == {"status": "succeeded", "error": None, "dag_id": REAL_DAG_ID} + assert any("CloudWatch logging disabled" in message for message in caplog.messages) + + +def test_cloudwatch_stream_batches_events_before_delivery(): + calls: list[list[str]] = [] + + class RecordingLogsClient: + def create_log_group(self, **kwargs): + return None + + def create_log_stream(self, **kwargs): + return None + + def put_log_events(self, **kwargs): + 
calls.append([event["message"] for event in kwargs["logEvents"]]) + return {} + + stream = _CloudWatchStream(cache_key="ck-batch", execution_id="exec-batch", stream_kind="stdout") + stream._client = RecordingLogsClient() + stream._pending_events.clear() + stream._pending_bytes = 0 + + second_event_bytes = len("second".encode("utf-8")) + _CLOUDWATCH_EVENT_OVERHEAD_BYTES + large_message = "x" * (_CLOUDWATCH_MAX_BATCH_BYTES - _CLOUDWATCH_EVENT_OVERHEAD_BYTES - second_event_bytes + 1) + stream.emit(large_message) + stream.emit("second") + stream.close(terminal_status="succeeded") + + assert len(calls) == 2 + assert calls[0] == [large_message] + assert calls[1][0] == "second" + assert json.loads(calls[1][1])["event"] == "stream_end" + + +def test_cloudwatch_stream_splits_single_event_that_exceeds_size_limit(): + class RecordingLogsClient: + def __init__(self): + self.calls: list[list[str]] = [] + + def create_log_group(self, **kwargs): + return None + + def create_log_stream(self, **kwargs): + return None + + def put_log_events(self, **kwargs): + self.calls.append([event["message"] for event in kwargs["logEvents"]]) + return {} + + client = RecordingLogsClient() + stream = _CloudWatchStream(cache_key="ck-oversized", execution_id="exec-oversized", stream_kind="stdout") + stream._client = client + stream._pending_events.clear() + stream._pending_bytes = 0 + too_large_message = "x" * (_CLOUDWATCH_MAX_MESSAGE_BYTES + 1) + + stream.emit(too_large_message) + stream.emit("tail") + stream.close(terminal_status="succeeded") + + all_messages = [message for call in client.calls for message in call] + oversized_chunks = [message for message in all_messages if set(message) == {"x"}] + assert len(oversized_chunks) == 2 + assert "".join(oversized_chunks) == too_large_message + assert "tail" in all_messages diff --git a/tests/integration/internal/conftest.py b/tests/integration/internal/conftest.py new file mode 100644 index 0000000..344748d --- /dev/null +++ 
b/tests/integration/internal/conftest.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +import shutil +import tempfile +from pathlib import Path + +import pytest + +from daggerml._internal._db import DmlDbEnv, DmlDbMapFullError +from daggerml._internal.ops.base_ops import BaseOps +from daggerml._internal.types import NAMESPACES + + +class TmpEnv(DmlDbEnv): + def clear_all(self): + while True: + try: + with self.tx(readonly=False) as txn: + for ns in NAMESPACES: + for obj, _ in txn.iter(ns): + txn.delete(obj) + db_path = Path(self.path) + repo_root = db_path.parent.parent if db_path.name == "db" and db_path.parent.name == ".dml" else db_path + shutil.rmtree(repo_root / ".dml", ignore_errors=True) + db_path.mkdir(parents=True, exist_ok=True) + return + except DmlDbMapFullError: + self.resize(self.get_size() * 2) + + +@pytest.fixture(scope="module") +def temp_bo(): + with tempfile.TemporaryDirectory() as temp_dir: + db_path = Path(temp_dir) / ".dml" / "db" + db_path.mkdir(parents=True, exist_ok=True) + db_env = TmpEnv.create(str(db_path), namespaces=sorted(NAMESPACES)) + try: + yield BaseOps(db_env) + finally: + db_env.clear_all() + db_env.close() diff --git a/tests/integration/internal/ops/conftest.py b/tests/integration/internal/ops/conftest.py new file mode 100644 index 0000000..c13947b --- /dev/null +++ b/tests/integration/internal/ops/conftest.py @@ -0,0 +1,14 @@ +from tests.contracts.internal.support.conftest_support import ( # noqa: F401 + _aws_server, + aws_server, + clear_envvars, + db, + integration_remote_ops, + integration_remote_ops_fn, + remote_ops, + s3, + temp_bo, + temp_bo_fn, + temp_db, + temp_db_fn, +) diff --git a/tests/integration/internal/ops/test_cache_integration.py b/tests/integration/internal/ops/test_cache_integration.py new file mode 100644 index 0000000..ff770e6 --- /dev/null +++ b/tests/integration/internal/ops/test_cache_integration.py @@ -0,0 +1,109 @@ +import os + +import pytest + +from daggerml._internal._db import Ref +from 
daggerml._internal.ops.cache import CacheOps +from daggerml._internal.types import ( + ArgvNode, + Dag, + DictDatum, + DmlRepoError, + KwargvNode, + ListDatum, + ScalarDatum, +) + +pytestmark = pytest.mark.slow + + +def _remote_root_from_env() -> str: + return os.environ["DML_REMOTE_ROOT"] + + +def _put_datum_hashed(temp_bo, data) -> Ref: + with temp_bo._tx(readonly=False) as txn: + return txn.put(ScalarDatum(data=data)) + + +def _put_argv_node_hashed(temp_bo, datum_ref: Ref) -> Ref: + with temp_bo._tx(readonly=False) as txn: + argv_datum_ref = txn.put(ListDatum(data=[datum_ref])) + argv_node_ref = txn.put(ArgvNode(value=argv_datum_ref)) + kwargv_datum_ref = txn.put(DictDatum(data={})) + txn.put(KwargvNode(value=kwargv_datum_ref)) + return argv_node_ref + + +def _cache_key_for_argv(temp_bo, argv_ref: Ref) -> str: + with temp_bo._tx(readonly=True) as txn: + return txn.get(argv_ref).datum_ref(txn).id() + + +def _put_dag_hashed(temp_bo, argv_ref: Ref | None = None) -> Ref: + with temp_bo._tx(readonly=False) as txn: + dag = Dag(nodes=[argv_ref] if argv_ref else [], names={}, result=None, argv=argv_ref) + return txn.put(dag) + + +def _new_ops(temp_bo) -> CacheOps: + return CacheOps(temp_bo._db, remote_root=_remote_root_from_env()) + + +def test_put_get_delete_roundtrip_remote(temp_bo, s3): + ops = _new_ops(temp_bo) + datum_ref = _put_datum_hashed(temp_bo, "value") + argv_ref = _put_argv_node_hashed(temp_bo, datum_ref) + dag_ref = _put_dag_hashed(temp_bo, argv_ref) + cache_key = _cache_key_for_argv(temp_bo, argv_ref) + + assert ops.get(argv_ref) is None + assert ops.put(dag_ref, execution_id="exec-1") == cache_key + remote_ops = ops._require_remote_context() + cache_ref_obj = remote_ops._decode_ref(remote_ops._remote_get_ref(f"cache/{cache_key}.json")) + assert cache_ref_obj["targets"] == {"dag": []} + assert ops.get(argv_ref) == dag_ref + assert ops.delete(argv_ref) is True + assert ops.get(argv_ref) is None + assert ops.delete(argv_ref) is False + + +def 
test_list_limit_and_clear_remote(temp_bo, s3): + ops = _new_ops(temp_bo) + ops.clear() + entries: list[tuple[str, Ref]] = [] + for i in range(3): + datum_ref = _put_datum_hashed(temp_bo, f"value-{i}") + argv_ref = _put_argv_node_hashed(temp_bo, datum_ref) + dag_ref = _put_dag_hashed(temp_bo, argv_ref) + cache_key = _cache_key_for_argv(temp_bo, argv_ref) + ops.put(dag_ref, execution_id=f"exec-{i}") + entries.append((cache_key, dag_ref)) + + limited = list(ops.list(limit=1)) + assert len(limited) == 1 + assert limited[0] in entries + + listed = list(ops.list()) + assert set(listed) == set(entries) + assert ops.clear() == 3 + assert list(ops.list()) == [] + assert ops.clear() == 0 + + +def test_requires_remote_context(temp_bo): + ops = CacheOps(temp_bo._db, remote_root="") + datum_ref = _put_datum_hashed(temp_bo, "value") + argv_ref = _put_argv_node_hashed(temp_bo, datum_ref) + dag_ref = _put_dag_hashed(temp_bo, argv_ref) + + with pytest.raises(DmlRepoError, match="Remote cache context required"): + ops.get(argv_ref) + with pytest.raises(DmlRepoError, match="Remote cache context required"): + ops.put(dag_ref, execution_id="exec-1") + with pytest.raises(DmlRepoError, match="Remote cache context required"): + list(ops.list()) + with pytest.raises(DmlRepoError, match="Remote cache context required"): + ops.delete(argv_ref) + with pytest.raises(DmlRepoError, match="Remote cache context required"): + ops.clear() diff --git a/tests/integration/internal/ops/test_commit_integration.py b/tests/integration/internal/ops/test_commit_integration.py new file mode 100644 index 0000000..6ebe2ce --- /dev/null +++ b/tests/integration/internal/ops/test_commit_integration.py @@ -0,0 +1,493 @@ +"""Comprehensive tests for commit.py module with real database integration.""" + +import pytest +from hypothesis import HealthCheck, assume, given, settings +from hypothesis import strategies as st + +from daggerml._internal.ops.commit import CommitOps +from daggerml._internal.ops.head import 
HeadOps +from daggerml._internal.types import Commit, DmlRepoError, Tree +from tests.contracts.internal.support.test_db_support import _gen_ref +from tests.contracts.internal.test_types_contract import _commit_strategy, _tree_strategy + +pytestmark = pytest.mark.slow + + +class TestCommitOps: + """Test CommitOps functionality with mocks.""" + + @pytest.fixture + def ops(self, temp_db): + """Create CommitOps with stored head context.""" + ops = CommitOps(_db=temp_db) + # Create and store a complete context + tree = Tree(dags={"main": _gen_ref("dag")}) + with ops._tx(readonly=False) as txn: + tree_ref = txn.put(tree) + commit = Commit(parents=[], tree=tree_ref, author="test_user", message="Initial commit") + commit_ref = txn.put(commit) + head_ops = HeadOps(_db=temp_db) + try: + head_ops.delete_branch("main") + except DmlRepoError: + pass + head_ops.create_branch("main", commit_ref) + return ops, "main" + + def test_list_commits_integration(self, ops): + """Test listing commit history with real database.""" + ops, _head_ref = ops + # Create a commit chain: A -> B -> C + tree = Tree(dags={"test": _gen_ref("dag")}) + with ops._tx(readonly=False) as txn: + tree_ref = txn.put(tree) + commit_a = Commit(parents=[], tree=tree_ref, author="user", message="A") + commit_a_ref = txn.put(commit_a) + commit_b = Commit(parents=[commit_a_ref], tree=tree_ref, author="user", message="B") + commit_b_ref = txn.put(commit_b) + commit_c = Commit(parents=[commit_b_ref], tree=tree_ref, author="user", message="C") + commit_c_ref = txn.put(commit_c) + + # Test list without limit + commits = list(ops.list(commit_c_ref)) + assert len(commits) == 3 + assert commits == [commit_c_ref, commit_b_ref, commit_a_ref] + + # Test list with limit + limited_commits = list(ops.list(commit_c_ref, limit=2)) + assert len(limited_commits) == 2 + assert limited_commits == [commit_c_ref, commit_b_ref] + + def test_get_dag_integration(self, ops): + """Test get_dag with real database.""" + ops, _ = ops + # Create 
tree with multiple DAGs + dag1_ref = _gen_ref("dag") + dag2_ref = _gen_ref("dag") + tree = Tree(dags={"dag1": dag1_ref, "dag2": dag2_ref}) + with ops._tx(readonly=False) as txn: + tree_ref = txn.put(tree) + commit = Commit(parents=[], tree=tree_ref, author="test", message="Test commit") + commit_ref = txn.put(commit) + + # Test getting existing DAG + result = ops.get_dag(commit_ref, "dag1") + assert result == dag1_ref + + # Test getting non-existent DAG + result = ops.get_dag(commit_ref, "nonexistent") + assert result is None + + def test_describe_integration(self, ops): + """Test describe returns stable commit metadata.""" + ops, _ = ops + dag_ref = _gen_ref("dag") + tree = Tree(dags={"dag1": dag_ref}) + with ops._tx(readonly=False) as txn: + tree_ref = txn.put(tree) + commit = Commit(parents=[], tree=tree_ref, author="test", message="Test commit", dag=dag_ref) + commit_ref = txn.put(commit) + + info = ops.describe(commit_ref) + assert info["id"] == commit_ref.id() + assert info["tree"] == tree_ref + assert info["author"] == "test" + assert info["message"] == "Test commit" + assert info["dag"] == dag_ref + + def test_delete_dag_integration(self, ops): + """Test delete_dag with real database.""" + + ops, _branch_name = ops + branch = "main" + # Create tree with multiple DAGs and store as head + keep1_ref = _gen_ref("dag") + delete_ref = _gen_ref("dag") + keep2_ref = _gen_ref("dag") + original_tree = Tree(dags={"keep1": keep1_ref, "delete_me": delete_ref, "keep2": keep2_ref}) + with ops._tx(readonly=False) as txn: + tree_ref = txn.put(original_tree) + commit = Commit(parents=[], tree=tree_ref, author="test_user", message="Original commit") + commit_ref = txn.put(commit) + HeadOps(_db=ops._db).update_branch_commit(branch, HeadOps(_db=ops._db).get_branch_commit(branch), commit_ref) + + # Test delete_dag operation (note: returns self for chaining now) + result = ops.delete_dag("delete_me", branch, "test_user") + assert result is ops + + # Get updated context to verify 
changes + with ops._tx(readonly=True) as txn: + ctx = txn.get_commit_ctx(HeadOps(_db=ops._db).get_branch_commit(branch)) + + # Verify DAG was removed from tree + assert "keep1" in ctx.tree.dags + assert "keep2" in ctx.tree.dags + assert "delete_me" not in ctx.tree.dags + + # Verify commit metadata + assert "Delete DAG 'delete_me'" in ctx.commit.message + assert ctx.commit.author == "test_user" + assert len(ctx.commit.parents) == 1 + + def test_delete_dag_updates_branch_after_new_commit_is_visible(self, ops, monkeypatch): + ops, _branch_name = ops + branch = "main" + original_tree = Tree(dags={"delete_me": _gen_ref("dag")}) + with ops._tx(readonly=False) as txn: + tree_ref = txn.put(original_tree) + commit_ref = txn.put(Commit(parents=[], tree=tree_ref, author="test_user", message="Original commit")) + head_ops = HeadOps(_db=ops._db) + head_ops.update_branch_commit(branch, head_ops.get_branch_commit(branch), commit_ref) + + seen = {} + original_update = HeadOps.update_branch_commit + + def _spy(self, branch_name, old_commit, new_commit, txn=None): + del txn + with ops._tx(readonly=True) as read_txn: + seen["exists"] = read_txn.exists(new_commit) + return original_update(self, branch_name, old_commit, new_commit) + + monkeypatch.setattr(HeadOps, "update_branch_commit", _spy) + + result = ops.delete_dag("delete_me", branch, "test_user") + + assert result is ops + assert seen["exists"] is True + + def test_topo_sort_integration(self, ops): + """Test _topo_sort with real commit chain.""" + ops, _ = ops + # Create a commit chain: A -> B -> C + tree = Tree(dags={"test": _gen_ref("dag")}) + with ops._tx(readonly=False) as txn: + tree_ref = txn.put(tree) + commit_a = Commit(parents=[], tree=tree_ref, author="user", message="A") + commit_a_ref = txn.put(commit_a) + commit_b = Commit(parents=[commit_a_ref], tree=tree_ref, author="user", message="B") + commit_b_ref = txn.put(commit_b) + commit_c = Commit(parents=[commit_b_ref], tree=tree_ref, author="user", message="C") + 
commit_c_ref = txn.put(commit_c) + + # Test topological sort + sorted_refs = ops._topo_sort(commit_c_ref) + + # Should return [C, B, A] - descendants before ancestors + assert len(sorted_refs) == 3 + assert sorted_refs[0] == commit_c_ref + assert sorted_refs[1] == commit_b_ref + assert sorted_refs[2] == commit_a_ref + + def test_merge_base_integration(self, ops): + """Test _merge_base with real commit DAG.""" + ops, _ = ops + # Create commit DAG: A -> B -> D + # A -> C -> E + tree = Tree(dags={"test": _gen_ref("dag")}) + with ops._tx(readonly=False) as txn: + tree_ref = txn.put(tree) + commit_a = Commit(parents=[], tree=tree_ref, author="user", message="A") + commit_a_ref = txn.put(commit_a) + commit_b = Commit(parents=[commit_a_ref], tree=tree_ref, author="user", message="B") + commit_b_ref = txn.put(commit_b) + commit_c = Commit(parents=[commit_a_ref], tree=tree_ref, author="user", message="C") + commit_c_ref = txn.put(commit_c) + commit_d = Commit(parents=[commit_b_ref], tree=tree_ref, author="user", message="D") + commit_d_ref = txn.put(commit_d) + commit_e = Commit(parents=[commit_c_ref], tree=tree_ref, author="user", message="E") + commit_e_ref = txn.put(commit_e) + + # Test merge base + merge_base_ref = ops._merge_base(commit_d_ref, commit_e_ref) + + # Common ancestor should be A + assert merge_base_ref == commit_a_ref + + def test_diff_trees_integration(self, ops): + """Test _diff method with real trees.""" + ops, _ = ops + # Create two different trees + old_ref = _gen_ref("dag") + unique1_ref = _gen_ref("dag") + new_ref = _gen_ref("dag") + unique2_ref = _gen_ref("dag") + tree1 = Tree(dags={"common": old_ref, "only_in_1": unique1_ref}) + tree2 = Tree(dags={"common": new_ref, "only_in_2": unique2_ref}) + with ops._tx(readonly=False) as txn: + tree1_ref = txn.put(tree1) + tree2_ref = txn.put(tree2) + + # Test diff + with ops._tx(readonly=True) as txn: + diff_result = ops._diff(tree1_ref, tree2_ref, txn) + + # Check structure + assert "add" in diff_result + 
assert "rem" in diff_result + + # Tree2 additions: only_in_2 (new), common (changed value) + assert "only_in_2" in diff_result["add"] + assert diff_result["add"]["only_in_2"] == unique2_ref + assert "common" in diff_result["add"] + assert diff_result["add"]["common"] == new_ref + assert diff_result["rem"]["only_in_1"] == unique1_ref + assert diff_result["rem"]["common"] == old_ref + + def test_patch_trees_integration(self, ops): + """Test _patch method with real trees.""" + ops, _ = ops + # Create base tree + keep_ref = _gen_ref("dag") + old_ref = _gen_ref("dag") + remove_ref = _gen_ref("dag") + new_ref = _gen_ref("dag") + added_ref = _gen_ref("dag") + base_tree = Tree(dags={"keep": keep_ref, "modify": old_ref, "remove": remove_ref}) + patch_diff = {"add": {"modify": new_ref, "add": added_ref}, "rem": {"remove": remove_ref}} + + # Store the base tree and apply the patch in one write transaction (txn is passed to _patch) + with ops._tx(readonly=False) as txn: + base_tree_ref = txn.put(base_tree) + patched_tree_ref = ops._patch(base_tree_ref, patch_diff, txn=txn) + + # Verify patched tree + with ops._tx(readonly=True) as txn: + patched_tree = txn.get(patched_tree_ref) + assert isinstance(patched_tree, Tree) + assert "keep" in patched_tree.dags # Unchanged + assert "modify" in patched_tree.dags # Modified + assert patched_tree.dags["modify"] == new_ref + assert patched_tree.dags["keep"] == keep_ref + assert patched_tree.dags["add"] == added_ref + assert "remove" not in patched_tree.dags # Removed + + def test_merge_integration(self, ops): + """Test merge operation with real database.""" + ops, _ = ops + # Create base tree and commit (common ancestor) + original_ref = _gen_ref("dag") + base_tree = Tree(dags={"common": original_ref}) + with ops._tx(readonly=False) as txn: + base_tree_ref = txn.put(base_tree) + base_commit = Commit(parents=[], tree=base_tree_ref, author="base_user", message="Base commit") + base_commit_ref = txn.put(base_commit) + # Create first branch: modify common, add dag1 + 
branch1_ref = _gen_ref("dag") + unique1_ref = _gen_ref("dag") + tree1 = Tree(dags={"common": branch1_ref, "dag1": unique1_ref}) + tree1_ref = txn.put(tree1) + commit1 = Commit(parents=[base_commit_ref], tree=tree1_ref, author="user1", message="First branch") + commit1_ref = txn.put(commit1) + # Create second branch: modify common differently, add dag2 + branch2_ref = _gen_ref("dag") + unique2_ref = _gen_ref("dag") + tree2 = Tree(dags={"common": branch2_ref, "dag2": unique2_ref}) + tree2_ref = txn.put(tree2) + commit2 = Commit(parents=[base_commit_ref], tree=tree2_ref, author="user2", message="Second branch") + commit2_ref = txn.put(commit2) + + # Test merge operation + try: + merged_ref = ops.merge(commit1_ref, commit2_ref, "test_user") + + # If merge succeeds, verify result + with ops._tx(readonly=True) as txn: + merged_commit = txn.get(merged_ref) + assert isinstance(merged_commit, Commit) + assert len(merged_commit.parents) == 2 + assert commit1_ref in merged_commit.parents + assert commit2_ref in merged_commit.parents + assert merged_commit.author == "test_user" + + # Check merged tree + with ops._tx(readonly=True) as txn: + merged_tree = txn.get(merged_commit.tree) + assert isinstance(merged_tree, Tree) + # Both unique DAGs should be preserved + assert "dag1" in merged_tree.dags + assert "dag2" in merged_tree.dags + # Conflicted "common" will have one of the values + assert "common" in merged_tree.dags + + except DmlRepoError as e: + # Merge conflict is expected due to conflicting "common" DAG + assert "conflict" in str(e).lower() + + def test_merge_simple_case(self, ops): + """Controlled test for merge behavior when one branch adds a single DAG key. + + Hypothesis: map/key handling between C and Python may produce spurious + keys due to ownership/termination issues; this small controlled test + reproduces the earlier failing scenario deterministically. 
+ """ + ops, _ = ops + # Base commit + base_tree = Tree(dags={}) + with ops._tx(readonly=False) as txn: + base_tree_ref = txn.put(base_tree) + base_commit = Commit(parents=[], tree=base_tree_ref, author="base", message="base") + base_commit_ref = txn.put(base_commit) + # t1 has a single DAG named '0' + t1 = Tree(dags={"0": _gen_ref("dag")}) + t1_ref = txn.put(t1) + c1 = Commit(parents=[base_commit_ref], tree=t1_ref, author="user1", message="c1") + c1_ref = txn.put(c1) + # t2 empty + t2 = Tree(dags={}) + t2_ref = txn.put(t2) + c2 = Commit(parents=[base_commit_ref], tree=t2_ref, author="user2", message="c2") + c2_ref = txn.put(c2) + + # Merge c1 and c2 with overwrite strategy + merged_ref = ops.merge(c1_ref, c2_ref, "test_user") + with ops._tx(readonly=True) as txn: + merged_commit = txn.get(merged_ref) + mtree = txn.get(merged_commit.tree) + # Expect merged tree to reflect overwrite (method semantics may vary), + # but ensure keys are valid Python strings and not corrupted types. + assert all(isinstance(k, str) for k in mtree.dags.keys()) + # Ensure no unexpected keys (either '0' present or not depending on method), + # but crucially avoid segfaults during access. 
+ _ = list(mtree.dags.keys()) + + def test_rebase_integration(self, ops): + """Test rebase operation with real database.""" + ops, _ = ops + # Create base commit + base_ref = _gen_ref("dag") + target_ref = _gen_ref("dag") + source_ref = _gen_ref("dag") + base_tree = Tree(dags={"base": base_ref}) + target_tree = Tree(dags={"base": base_ref, "target": target_ref}) + source_tree = Tree(dags={"base": base_ref, "source": source_ref}) + with ops._tx(readonly=False) as txn: + base_tree_ref = txn.put(base_tree) + base_commit = Commit(parents=[], tree=base_tree_ref, author="base", message="Base") + base_commit_ref = txn.put(base_commit) + target_tree_ref = txn.put(target_tree) + target_commit = Commit(parents=[base_commit_ref], tree=target_tree_ref, author="target", message="Target") + target_commit_ref = txn.put(target_commit) + source_tree_ref = txn.put(source_tree) + source_commit = Commit(parents=[base_commit_ref], tree=source_tree_ref, author="source", message="Source") + source_commit_ref = txn.put(source_commit) + + # Test rebase operation + rebased_ref = ops.rebase(source_commit_ref, target_commit_ref, "test_user") + + # Verify rebase result + with ops._tx(readonly=True) as txn: + rebased_commit = txn.get(rebased_ref) + assert isinstance(rebased_commit, Commit) + assert rebased_commit.author == "test_user" + assert rebased_commit.message == "Source" # Preserves original message + assert len(rebased_commit.parents) == 1 + assert rebased_commit.parents[0] == target_commit_ref # New parent + + # Verify rebased tree combines changes + with ops._tx(readonly=True) as txn: + rebased_tree = txn.get(rebased_commit.tree) + assert isinstance(rebased_tree, Tree) + assert "base" in rebased_tree.dags + assert "source" in rebased_tree.dags # Source changes preserved + assert "target" in rebased_tree.dags # Target base included + + @given(st.lists(_tree_strategy(), min_size=2, max_size=5)) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) + def 
test_multiple_commits_hypothesis(self, ops, tree_objs): + """Test creating and listing multiple commits with hypothesis.""" + ops, _ = ops + # Store all trees and create commit chain + commit_refs = [] + delete_refs = [] + prev_commit_ref = None + with ops._tx(readonly=False) as txn: + for i, tree_obj in enumerate(tree_objs): + tree_ref = txn.put(tree_obj) + delete_refs.append(tree_ref) + parents = [prev_commit_ref] if prev_commit_ref else [] + commit = Commit(parents=parents, tree=tree_ref, author=f"user_{i}", message=f"Commit {i}") + commit_ref = txn.put(commit) + commit_refs.append(commit_ref) + delete_refs.append(commit_ref) + prev_commit_ref = commit_ref + # Test listing commits from latest + if commit_refs: + listed_commits = list(ops.list(commit_refs[-1])) + assert len(listed_commits) == len(commit_refs) + # Should be in reverse order (newest first) + assert listed_commits == list(reversed(commit_refs)) + # Cleanup + with ops._tx(readonly=False) as txn: + for ref in set(delete_refs): + txn.delete(ref) + + @given(_commit_strategy(), _tree_strategy()) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) + def test_get_dag_hypothesis(self, ops, commit_obj, tree_obj): + """Test get_dag with hypothesis-generated commits.""" + ops, _ = ops + # Store tree and commit + with ops._tx(readonly=False) as txn: + commit_obj.tree = txn.put(tree_obj) + commit_ref = txn.put(commit_obj) + delete_refs = [commit_ref, commit_obj.tree] + # Get the tree and test DAG retrieval + with ops._tx(readonly=True) as txn: + tree = txn.get(commit_obj.tree) + if tree.dags: + # Test getting existing DAG + dag_name = next(iter(tree.dags.keys())) + expected_dag_ref = tree.dags[dag_name] + + result = ops.get_dag(commit_ref, dag_name) + assert result == expected_dag_ref + + # Test getting non-existent DAG + result = ops.get_dag(commit_ref, "definitely_does_not_exist_12345") + assert result is None + with ops._tx(readonly=False) as txn: + for ref in set(delete_refs): + 
txn.delete(ref) + + @given( + _commit_strategy(), + _tree_strategy(), + _commit_strategy(), + _tree_strategy(), + _commit_strategy(), + _tree_strategy(), + ) + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) + def test_merge_real(self, ops, c0, t0, c1, t1, c2, t2): + """Test merge with hypothesis-generated commits.""" + ops, _ = ops + assume(set(t1.dags.keys()).isdisjoint(set(t2.dags.keys()))) + assume(set(t0.dags.keys()).isdisjoint(set([*t1.dags.keys(), *t2.dags.keys()]))) + # Store commits + with ops._tx(readonly=False) as txn: + c0.parents = [] + c0.tree = txn.put(t0) + commit_ref0 = txn.put(c0) + c1.parents = [commit_ref0] + c1.tree = txn.put(t1) + c2.parents = [commit_ref0] + c2.tree = txn.put(t2) + commit_ref1 = txn.put(c1) + commit_ref2 = txn.put(c2) + delete_refs = [commit_ref1, commit_ref2, c1.tree, c2.tree, commit_ref0, c0.tree] + # Test merge operation + merged_ref = ops.merge(commit_ref1, commit_ref2, "test_user") + delete_refs.append(merged_ref) + with ops._tx(readonly=True) as txn: + merged_commit = txn.get(merged_ref) + assert isinstance(merged_commit, Commit) + assert len(merged_commit.parents) == 2 + assert commit_ref1 in merged_commit.parents + assert commit_ref2 in merged_commit.parents + assert merged_commit.author == "test_user" + mtree = txn.get(merged_commit.tree) + # removes `c0` dags plus adds from `c1` and `c2` + assert set(mtree.dags.keys()) == set([*t1.dags.keys(), *t2.dags.keys()]) + with ops._tx(readonly=False) as txn: + for ref in set(delete_refs): + txn.delete(ref) diff --git a/tests/integration/internal/ops/test_head_integration.py b/tests/integration/internal/ops/test_head_integration.py new file mode 100644 index 0000000..3cea75a --- /dev/null +++ b/tests/integration/internal/ops/test_head_integration.py @@ -0,0 +1,145 @@ +"""Comprehensive tests for head.py module with real database integration.""" + +from pathlib import Path + +import pytest + +from daggerml._internal.ops.head import HeadOps +from 
daggerml._internal.types import Commit, DmlRepoError, Tree + +pytestmark = pytest.mark.slow + + +class TestHeadOps: + """Integration tests for HeadOps using the temp_bo database fixture.""" + + def test_create_and_delete_branch_head_roundtrip(self, temp_bo): + """Test creating a branch, reading its commit, and deleting it.""" + branch_name = "feature" + existing_branch = "main" + ops = HeadOps(temp_bo._db) + with temp_bo._tx(readonly=False) as txn: + tree_ref = txn.put(Tree(dags={})) + cr0 = txn.put(Commit(parents=[], tree=tree_ref, author="test", message="base")) + ops.create_branch(existing_branch, cr0) + ref = ops.create_branch(branch_name, cr0) + assert ref == branch_name + assert ops.get_branch_commit(ref) == cr0 + ops.delete_branch(ref) + ops.delete_branch(existing_branch) + with pytest.raises(DmlRepoError, match="Pointer does not exist"): + ops.get_branch_commit(ref) + with temp_bo._tx(readonly=False) as txn: + txn.delete(cr0) + + def test_list(self, temp_bo): + """Test that list_branches includes every created branch.""" + ops = HeadOps(temp_bo._db) + with temp_bo._tx(readonly=False) as txn: + tree_ref = txn.put(Tree(dags={})) + commit_ref = txn.put(Commit(parents=[], tree=tree_ref, author="test", message="base")) + head_names = ["main", "feature", "release_1"] + for head_name in head_names: + ops.create_branch(head_name, commit_ref) + listed_heads = ops.list_branches() + assert set(head_names).issubset(set(listed_heads)) + for head_name in head_names: + ops.delete_branch(head_name) + + def test_get_branch_commit(self, temp_bo): + with temp_bo._tx(readonly=False) as txn: + tree_ref = txn.put(Tree(dags={})) + commit_ref = txn.put(Commit(parents=[], tree=tree_ref, author="test", message="base")) + ops = HeadOps(temp_bo._db) + ops.create_branch("main", commit_ref) + assert ops.get_branch_commit("main") == commit_ref + ops.delete_branch("main") + + def test_bootstrap_branch_pointer_is_written_after_commit_is_visible(self, temp_bo, monkeypatch): + ops = HeadOps(temp_bo._db) + seen = {} + create_pointer = ops._create_pointer + + def 
_spy(pointer_path, commit_ref): + with temp_bo._tx(readonly=True) as txn: + seen["exists"] = txn.exists(commit_ref) + return create_pointer(pointer_path, commit_ref) + + monkeypatch.setattr(ops, "_create_pointer", _spy) + + branch = ops.create_branch("main") + + assert branch == "main" + assert seen["exists"] is True + assert ops.get_branch_commit(branch).ns() == "commit" + + def test_bootstrap_branch_with_caller_txn_is_rejected(self, temp_bo): + ops = HeadOps(temp_bo._db) + + with temp_bo._tx(readonly=False) as txn: + with pytest.raises(DmlRepoError, match="does not support caller-owned transactions"): + ops.create_branch("txn-main", txn=txn) + + def test_index_pointer_ops_do_not_require_live_commits(self, temp_bo): + ops = HeadOps(temp_bo._db) + missing_commit = ops.get_branch_commit(ops.create_branch("stale-main")) + with temp_bo._tx(readonly=False) as txn: + txn.delete(missing_commit) + + index_id = ops.create_index(missing_commit) + updated_commit = type(missing_commit)(f"commit:{'f' * 64}") + + assert ops.get_index_commit(index_id) == missing_commit + assert ops.update_index_commit(index_id, missing_commit, updated_commit) == updated_commit + assert ops.get_index_commit(index_id) == updated_commit + + ops.delete_index(index_id) + with pytest.raises(DmlRepoError, match="Pointer does not exist"): + ops.get_index_commit(index_id) + + def test_head_roundtrip_supports_attached_and_detached_payloads(self, temp_bo): + ops = HeadOps(temp_bo._db) + branch = ops.create_branch("feature") + branch_commit = ops.get_branch_commit(branch) + + ops.write_attached_head("feature") + attached = ops.get_head_state() + assert attached.mode == "attached" + assert attached.branch == "feature" + assert attached.commit == branch_commit + + ops.write_detached_head(branch_commit) + detached = ops.get_head_state() + assert detached.mode == "detached" + assert detached.branch is None + assert detached.commit == branch_commit + + def test_invalid_head_payload_fails_closed(self, temp_bo): + 
ops = HeadOps(temp_bo._db) + head_path = ops._head_path() + head_path.parent.mkdir(parents=True, exist_ok=True) + head_path.write_text("main\n") + + with pytest.raises(DmlRepoError, match="Invalid HEAD payload"): + ops.get_head_state() + + def test_attached_head_accepts_slash_and_dot_branch_names(self, temp_bo): + ops = HeadOps(temp_bo._db) + branch = ops.create_branch("topic/v1.0") + ops.write_attached_head(branch) + + state = ops.get_head_state() + + assert state.mode == "attached" + assert state.branch == "topic/v1.0" + assert "topic/v1.0" in ops.list_branches() + + def test_project_home_requires_dml_db_layout(self, temp_bo): + ops = HeadOps(temp_bo._db) + original_path = ops._db.path + ops._db.path = str(Path(temp_bo._db.path).parent.parent) + try: + with pytest.raises(DmlRepoError, match="Cannot resolve project home"): + ops.get_head_state() + finally: + ops._db.path = original_path diff --git a/tests/integration/internal/ops/test_remote_integration.py b/tests/integration/internal/ops/test_remote_integration.py new file mode 100644 index 0000000..bcf8530 --- /dev/null +++ b/tests/integration/internal/ops/test_remote_integration.py @@ -0,0 +1,2735 @@ +"""Tests for remote operations.""" + +import base64 +import hashlib +import json +import time +from unittest.mock import Mock, patch + +import pytest + +from daggerml._internal._db import Ref +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.remote import ( + InvalidManifest, + InvalidOid, + InvalidRef, + MissingCasObject, + RefAlreadyExists, + RemoteError, + RemoteOps, + ShaMismatch, +) +from daggerml._internal.types import Commit, DmlRepoError, Tree +from tests.contracts.internal.support.conftest_support import FakeDb, FakeTxn, remote_bucket_and_prefix_from_env + +pytestmark = pytest.mark.slow + + +def _put_remote_json(remote_ops, relative_key: str, value: dict) -> None: + remote_ops.client.put_object( + Bucket=remote_ops.bucket, + Key=remote_ops._prefixed_key(relative_key), + 
Body=json.dumps(value, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ContentType="application/json", + ) + + +class TestRemoteDescriptor: + """Tests for remote descriptor file handling.""" + + def test_ensure_descriptor_creates_when_missing(self, db, s3): + """Test that _ensure_remote_descriptor creates dml.json when missing.""" + bucket, prefix = remote_bucket_and_prefix_from_env() + descriptor_key = f"{prefix}/dml.json" if prefix else "dml.json" + + # Ensure descriptor doesn't exist initially + try: + s3.delete_object(Bucket=bucket, Key=descriptor_key) + except s3.exceptions.NoSuchKey: + pass # Already doesn't exist + + # Create RemoteOps instance, which should create the descriptor + RemoteOps( + _db=db, + client=s3, + bucket=bucket, + prefix=prefix, + ) + + # Verify descriptor was created + response = s3.get_object(Bucket=bucket, Key=descriptor_key) + descriptor = json.loads(response["Body"].read().decode("utf-8")) + + expected_descriptor = { + "schema": 0, + "hash": "sha256", + "layout": "cas+refs", + "refs_prefix": "refs", + "io_prefix": "io", + "cas_prefix": "cas/sha256", + } + assert descriptor == expected_descriptor + + def test_ensure_descriptor_validates_existing(self, db, s3): + """Test that _ensure_remote_descriptor fails on invalid existing descriptors.""" + bucket, prefix = remote_bucket_and_prefix_from_env() + invalid_prefix = f"{prefix}/invalid-descriptor-test" if prefix else "invalid-descriptor-test" + descriptor_key = f"{invalid_prefix}/dml.json" + + # Write an invalid descriptor + invalid_descriptor = { + "schema": 1, # Invalid schema + "hash": "md5", # Invalid hash + "layout": "cas-only", # Invalid layout + "refs_prefix": "references", # Invalid refs_prefix + "cas_prefix": "cas/md5", # Invalid cas_prefix + } + s3.put_object( + Bucket=bucket, + Key=descriptor_key, + Body=json.dumps(invalid_descriptor, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ContentType="application/json", + ) + + # Create RemoteOps, which should 
fail hard on invalid descriptor + with pytest.raises(DmlRepoError, match="Remote initialization failed"): + RemoteOps( + _db=db, + client=s3, + bucket=bucket, + prefix=invalid_prefix, + ) + + # Verify descriptor was not rewritten + response = s3.get_object(Bucket=bucket, Key=descriptor_key) + descriptor = json.loads(response["Body"].read().decode("utf-8")) + assert descriptor == invalid_descriptor + + +class TestFakeDb: + """Tests for FakeDb and FakeTxn implementations.""" + + def test_fake_db_tx_context_manager(self, db): + """Test that FakeDb.tx() returns a raw fake transaction.""" + with db.tx() as txn_ctx: + assert isinstance(txn_ctx, FakeTxn) + + def test_fake_txn_put_get_roundtrip(self, db): + """Test that FakeTxn can put and get values correctly.""" + test_ref = Ref("test:123") + test_value = {"key": "value"} + with db.tx() as txn_ctx: + # Put value + result_ref = txn_ctx.put(test_value, to=test_ref) + assert result_ref == test_ref + # Get value back + retrieved_value = txn_ctx.get(test_ref) + assert retrieved_value == test_value + + +class TestRemoteKeyMapping: + """Tests for remote key mapping helpers.""" + + def test_cas_key_sharding(self, remote_ops): + """Test CAS key generation with sharding for a known OID.""" + # Known OID from the spec + oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + expected_key = "test-prefix/cas/sha256/01/23/0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + # Test with the prefix set by the fixture + result = remote_ops._cas_key(oid) + assert result == expected_key + + def test_cas_key_with_prefix(self, remote_ops, monkeypatch): + """Test CAS key generation with a different prefix.""" + monkeypatch.setenv("DML_REMOTE_ROOT", "s3://test-bucket/myrepo") + # Need to recreate remote_ops to pick up the new prefix + remote_ops_with_prefix = RemoteOps( + _db=remote_ops._db, + client=remote_ops.client, + bucket="test-bucket", + prefix="myrepo", + ) + oid = 
"0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + expected_key = "myrepo/cas/sha256/01/23/0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + result = remote_ops_with_prefix._cas_key(oid) + assert result == expected_key + + def test_cas_key_invalid_oid(self, remote_ops): + """Test that _cas_key rejects invalid OIDs.""" + # Test non-hex characters + with pytest.raises(InvalidOid, match="Invalid OID"): + remote_ops._cas_key("gggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg") + # Test wrong length + with pytest.raises(InvalidOid, match="Invalid OID"): + remote_ops._cas_key("0123456789abcdef") + # Test uppercase + with pytest.raises(InvalidOid, match="Invalid OID"): + remote_ops._cas_key("0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF") + + def test_ref_key_joining(self, remote_ops): + """Test ref key generation joins prefix and ref path.""" + ref_path = "tags/main/abc123.json" + expected_key = "test-prefix/refs/tags/main/abc123.json" + result = remote_ops._ref_key(ref_path) + assert result == expected_key + + def test_ref_key_with_prefix(self, remote_ops, monkeypatch): + """Test ref key generation with a different prefix.""" + monkeypatch.setenv("DML_REMOTE_ROOT", "s3://test-bucket/myrepo") + # Need to recreate remote_ops to pick up the new prefix + remote_ops_with_prefix = RemoteOps( + _db=remote_ops._db, + client=remote_ops.client, + bucket="test-bucket", + prefix="myrepo", + ) + ref_path = "tags/main/abc123.json" + expected_key = "myrepo/refs/tags/main/abc123.json" + result = remote_ops_with_prefix._ref_key(ref_path) + assert result == expected_key + + def test_ref_key_rejects_path_traversal(self, remote_ops): + """Test that _ref_key rejects path traversal sequences.""" + # Test leading slash + with pytest.raises(ValueError, match="Invalid ref path"): + remote_ops._ref_key("/tags/main/abc123.json") + # Test double dot + with pytest.raises(ValueError, match="Invalid ref path"): + 
remote_ops._ref_key("../tags/main/abc123.json") + # Test double dot in middle + with pytest.raises(ValueError, match="Invalid ref path"): + remote_ops._ref_key("tags/main/../abc123.json") + + def test_dag_ref_path_and_key(self, remote_ops): + """Test DAG ref path/key helpers.""" + dag_id = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + assert remote_ops._dag_ref_path(dag_id) == f"dags/{dag_id}.json" + assert remote_ops._dag_ref_key(dag_id) == f"test-prefix/refs/dags/{dag_id}.json" + + def test_dag_ref_helpers_reject_invalid_dag_id(self, remote_ops): + """Test DAG ref helpers reject invalid DAG ids.""" + with pytest.raises(ValueError, match="Invalid DAG id"): + remote_ops._dag_ref_path("abc") + with pytest.raises(ValueError, match="Invalid DAG id"): + remote_ops._dag_ref_key("ABC" * 21 + "A") + + def test_ref_key_still_rejects_dag_paths(self, remote_ops): + """Test that _ref_key still rejects dags/* paths.""" + dag_id = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + with pytest.raises(ValueError, match="expected 'tags' or 'cache'"): + remote_ops._ref_key(f"dags/{dag_id}.json") + + +class TestRemoteWrappers: + """Tests for remote S3 thin wrappers.""" + + def test_remote_put_get_cas_roundtrip(self, remote_ops): + """Test putting and getting CAS objects roundtrip.""" + oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + test_data = b"Hello, CAS world!" 
+ + # Put the data + remote_ops._remote_put_cas(oid, test_data) + + # Get the data back + retrieved_data = remote_ops._remote_get_cas(oid) + assert retrieved_data == test_data + + def test_remote_has_cas_true_false(self, remote_ops): + """Test _remote_has_cas returns True for existing objects, False for non-existing.""" + oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + + # Clear any existing objects in the bucket first + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Initially should not exist + assert not remote_ops._remote_has_cas(oid) + + # Put some data + test_data = b"test data" + remote_ops._remote_put_cas(oid, test_data) + + # Now should exist + assert remote_ops._remote_has_cas(oid) + + def test_remote_put_ref_fails_if_exists(self, remote_ops): + """Test that _remote_put_ref fails if ref already exists.""" + ref_path = "tags/test/v1.json" + test_data = b'{"test": "data"}' + + # First put should succeed + remote_ops._remote_put_ref(ref_path, test_data) + + # Second put should fail + with pytest.raises(RefAlreadyExists): + remote_ops._remote_put_ref(ref_path, test_data) + + def test_remote_delete_ref(self, remote_ops): + """Test deleting refs.""" + ref_path = "tags/test/v2.json" + test_data = b'{"test": "delete me"}' + + # Put a ref + remote_ops._remote_put_ref(ref_path, test_data) + + # Verify it exists by trying to get it + retrieved_data = remote_ops._remote_get_ref(ref_path) + assert retrieved_data == test_data + + # Delete it + remote_ops._remote_delete_ref(ref_path) + + # Now getting it should fail + with pytest.raises(RemoteError): + remote_ops._remote_get_ref(ref_path) + + +class TestRemoteFixtures: + """Tests for remote operation fixtures.""" + + def test_remote_ops_fixture(self, 
remote_ops): + """Test that remote_ops fixture creates RemoteOps instance.""" + assert remote_ops is not None + assert hasattr(remote_ops, "_db") + assert hasattr(remote_ops, "client") + + def test_s3_fixture_creates_bucket(self, s3): + """Test that s3 fixture creates the expected bucket.""" + bucket, _prefix = remote_bucket_and_prefix_from_env() + # Try to list objects in the bucket - should not raise an exception + try: + s3.list_objects_v2(Bucket=bucket) + # If we get here, the bucket exists + # The bucket may contain descriptor files created by RemoteOps initialization + assert True # Bucket exists and is accessible + except s3.exceptions.NoSuchBucket: + raise AssertionError(f"Bucket {bucket} was not created") from None + + +class TestDecoding: + """Tests for manifest and ref decoding + validation.""" + + def test_decode_ref_valid(self, remote_ops): + """Test decoding a valid ref.""" + ref_data = { + "kind": "ref", + "schema": 0, + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + } + ref_bytes = json.dumps(ref_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + decoded = remote_ops._decode_ref(ref_bytes) + assert decoded == ref_data + + def test_decode_ref_valid_targets(self, remote_ops): + """Test decoding a ref with valid dag targets.""" + dag1 = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + dag2 = "1123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + ref_data = { + "kind": "ref", + "schema": 0, + "target": "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + "targets": {"dag": [dag1, dag2]}, + } + ref_bytes = json.dumps(ref_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + decoded = remote_ops._decode_ref(ref_bytes) + assert decoded == ref_data + + def test_decode_ref_valid_empty_targets(self, remote_ops): + """Test decoding a ref with empty dag targets.""" + ref_data = { + "kind": "ref", + 
"schema": 0, + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + decoded = remote_ops._decode_ref(ref_bytes) + assert decoded == ref_data + + def test_decode_ref_rejects_wrong_kind_or_schema(self, remote_ops): + """Test that _decode_ref rejects invalid kind or schema.""" + # Wrong kind + invalid_ref = { + "kind": "manifest", # Wrong kind + "schema": 0, + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="kind must be 'ref'"): + remote_ops._decode_ref(ref_bytes) + + # Wrong schema + invalid_ref = { + "kind": "ref", + "schema": 1, # Wrong schema + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="schema must be 0"): + remote_ops._decode_ref(ref_bytes) + + # Invalid target (uppercase) + invalid_ref = { + "kind": "ref", + "schema": 0, + "target": "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF", # Uppercase + "created_at": 1234567890, + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="target must be 64 lowercase hex"): + remote_ops._decode_ref(ref_bytes) + + # Invalid target (wrong length) + invalid_ref = { + "kind": "ref", + "schema": 0, + "target": "0123456789abcdef", # Too short + "created_at": 1234567890, + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="target must be 64 lowercase hex"): + remote_ops._decode_ref(ref_bytes) + + # Invalid created_at (not int) + invalid_ref = { + "kind": "ref", + "schema": 0, + "target": 
"0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": "1234567890", # String instead of int + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="created_at must be an integer"): + remote_ops._decode_ref(ref_bytes) + + def test_decode_manifest_valid(self, remote_ops): + """Test decoding a valid manifest.""" + manifest_data = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890", + "closure": { + "commit": ["0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"], + "blob": ["fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321"], + }, + } + manifest_bytes = json.dumps(manifest_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + decoded = remote_ops._decode_manifest(manifest_bytes) + assert decoded == manifest_data + + def test_decode_manifest_rejects_unsorted_or_dupes(self, remote_ops): + """Test that _decode_manifest rejects unsorted lists or duplicates.""" + # Unsorted list + invalid_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890", + "closure": { + "commit": [ + "fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321", + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + ], # Unsorted + }, + } + manifest_bytes = json.dumps(invalid_manifest).encode("utf-8") + with pytest.raises(InvalidManifest, match="must be sorted"): + remote_ops._decode_manifest(manifest_bytes) + + # Duplicate OIDs + invalid_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890", + "closure": { + "commit": [ + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + ], # Duplicate + }, + } + 
manifest_bytes = json.dumps(invalid_manifest).encode("utf-8") + with pytest.raises(InvalidManifest, match="must have no duplicates"): + remote_ops._decode_manifest(manifest_bytes) + + # Invalid OID in closure + invalid_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890", + "closure": { + "commit": ["gggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg"], # Invalid OID + }, + } + manifest_bytes = json.dumps(invalid_manifest).encode("utf-8") + with pytest.raises(InvalidManifest, match="must be 64 lowercase hex"): + remote_ops._decode_manifest(manifest_bytes) + + +class TestClosureUnion: + """Tests for closure union helper.""" + + def test_closure_union_flattens(self, remote_ops): + """Test that _closure_union flattens OIDs from all kinds.""" + closure = { + "commit": ["0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"], + "blob": ["fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321"], + "tree": ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], + } + expected_oids = { + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + } + result = remote_ops._closure_union(closure) + assert result == expected_oids + + def test_closure_union_dedupes_across_kinds(self, remote_ops): + """Test that _closure_union dedupes OIDs that appear in multiple kinds.""" + shared_oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + closure = { + "commit": [shared_oid], + "blob": [shared_oid, "fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321"], + } + expected_oids = { + shared_oid, + "fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321", + } + result = remote_ops._closure_union(closure) + assert result == expected_oids + assert 
len(result) == 2 # Should not have duplicates + + +class TestLocalHelpers: + """Tests for local manifest helpers.""" + + def test_local_dump_dict_stops_at_child_dags_for_commit_root(self, remote_ops): + """Test that commit-root dumps do not traverse into child DAG closures.""" + mock_txn = Mock() + commit_ref = Ref("commit:test") + tree_ref = Ref("tree:tree1") + root_blob_ref = Ref("blob:blob1") + tree_blob_ref = Ref("blob:blob2") + child_dag_ref = Ref("dag:dag1") + child_blob_ref = Ref("blob:blob3") + + raw_map = { + commit_ref: "commit-raw", + tree_ref: "tree-raw", + root_blob_ref: "blob1-raw", + tree_blob_ref: "blob2-raw", + child_dag_ref: "dag-raw", + child_blob_ref: "blob3-raw", + } + objs = { + commit_ref: {"tree": tree_ref, "message": root_blob_ref}, + tree_ref: {"meta": tree_blob_ref, "dags": {"child": child_dag_ref}}, + root_blob_ref: {"value": "root"}, + tree_blob_ref: {"value": "tree"}, + child_dag_ref: {"payload": child_blob_ref}, + child_blob_ref: {"value": "child"}, + } + + mock_txn.get.side_effect = lambda ref: objs[ref] + mock_txn.txn.get.side_effect = lambda ref, raw=False: raw_map[ref] if raw else objs[ref] + + result = remote_ops._local_dump_dict(mock_txn, commit_ref) + + assert result == { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "test", + "closure": { + "commit": {"test": "commit-raw"}, + "tree": {"tree1": "tree-raw"}, + "blob": {"blob1": "blob1-raw", "blob2": "blob2-raw"}, + }, + } + + def test_local_dump_dict_stops_at_child_dags_for_dag_root(self, remote_ops): + """Test that DAG-root dumps include only the root DAG and directly owned objects.""" + mock_txn = Mock() + root_dag_ref = Ref("dag:root") + root_blob_ref = Ref("blob:blob1") + child_dag_ref = Ref("dag:child") + child_blob_ref = Ref("blob:blob2") + + raw_map = { + root_dag_ref: "root-dag-raw", + root_blob_ref: "root-blob-raw", + child_dag_ref: "child-dag-raw", + child_blob_ref: "child-blob-raw", + } + objs = { + root_dag_ref: {"payload": 
root_blob_ref, "child": child_dag_ref}, + root_blob_ref: {"value": "root"}, + child_dag_ref: {"payload": child_blob_ref}, + child_blob_ref: {"value": "child"}, + } + + mock_txn.get.side_effect = lambda ref: objs[ref] + mock_txn.txn.get.side_effect = lambda ref, raw=False: raw_map[ref] if raw else objs[ref] + + result = remote_ops._local_dump_dict(mock_txn, root_dag_ref) + + assert result == { + "kind": "local-manifest", + "schema": 0, + "root-ns": "dag", + "root-id": "root", + "closure": { + "dag": {"root": "root-dag-raw"}, + "blob": {"blob1": "root-blob-raw"}, + }, + } + + def test_local_has_uses_txn_get(self, remote_ops): + """Test that _local_has checks if ref exists using txn.get.""" + from daggerml._internal.types import DmlRepoError + + mock_txn = Mock() + + # Test when ref doesn't exist (txn.get raises DmlRepoError) + mock_txn.get.side_effect = DmlRepoError("Object not found") + assert not remote_ops._local_has(mock_txn, "commit", "testid") + mock_txn.get.assert_called_with(Ref("commit:testid")) + + # Reset the mock + mock_txn.reset_mock() + + # Test when ref exists + mock_txn.get.return_value = {"some": "data"} + mock_txn.get.side_effect = None + assert remote_ops._local_has(mock_txn, "commit", "testid") + + def test_local_put_head_writes_expected_key_and_value(self, remote_ops): + """Test that _local_put_head writes the expected local tracking file.""" + mock_txn = Mock() + remote_name = "s3://test-bucket/test-prefix" + ref_path = "tags/main/abc123.json" + commit_id = "def4567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef" + + def _exists(ref): + return ref == Ref(f"commit:{commit_id}") + + mock_txn.exists.side_effect = _exists + + remote_ops._local_put_head(mock_txn, remote_name, ref_path, commit_id) + + branch_name = f"{remote_name}/{ref_path}" + pointer_path = HeadOps(_db=remote_ops._db)._external_tracking_path(branch_name) + assert pointer_path.read_text(encoding="utf-8") == commit_id + + +class TestBuildRemoteManifest: + """Tests for remote 
manifest building from local manifest.""" + + def test_build_remote_manifest_overrides_dag_closure_with_direct_ids(self, remote_ops): + """Test direct_dag_ids override replaces transitive dag closure.""" + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": { + "commit": { + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef": "data1", + }, + "dag": { + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef": "dag-a", + "1123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef": "dag-b", + }, + }, + } + + manifest_dict, _ = remote_ops._build_remote_manifest( + local_manifest, + direct_dag_ids=["0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"], + ) + + assert manifest_dict["closure"]["dag"] == ["0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"] + + def test_build_remote_manifest_sorts_each_namespace(self, remote_ops): + """Test that _build_remote_manifest sorts OIDs within each namespace.""" + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": { + "commit": { + "fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321": "data2", + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef": "data1", + }, + "blob": { + "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb": "blob2", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa": "blob1", + }, + }, + } + + manifest_dict, manifest_bytes = remote_ops._build_remote_manifest(local_manifest) + + # Check manifest dict + expected_dict = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": { + "commit": [ + 
"0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321", + ], + "blob": [ + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + ], + }, + } + assert manifest_dict == expected_dict + + # Check canonical bytes + expected_json = ( + '{"closure":{"blob":["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",' + '"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"],"commit":' + '["0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",' + '"fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321"]},"kind":' + '"manifest","root-id":"root1234567890abcdef1234567890abcdef1234567890abcdef' + '1234567890abcdef","root-ns":"commit","schema":0}' + ) + assert manifest_bytes.decode("utf-8") == expected_json + + def test_build_remote_manifest_dedupes(self, remote_ops): + """Test that _build_remote_manifest dedupes OIDs within each namespace.""" + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": { + "commit": { + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef": "data1", + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef": "duplicate", # noqa: F601 # Same ID with different data (testing deduplication) + }, + }, + } + + manifest_dict, _ = remote_ops._build_remote_manifest(local_manifest) + + # Should only have one occurrence of the ID + assert manifest_dict["closure"]["commit"] == [ + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + ] + + def test_direct_dag_ids_for_commit_uses_tree_dags_only(self, remote_ops): + """Test direct dag discovery for commits uses only Tree.dags.""" + dag_a = Ref("dag:0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef") + dag_b = 
Ref("dag:1123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef") + stray_dag = Ref("dag:2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef") + tree_ref = Ref("tree:3123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef") + commit_ref = Ref("commit:4123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef") + txn = Mock() + txn.get.side_effect = [ + Commit(parents=[], tree=tree_ref, author="test", message="msg", dag=stray_dag), + Tree(dags={"a": dag_a, "b": dag_b}), + ] + assert remote_ops._direct_dag_ids(txn, commit_ref) == sorted([dag_a.id(), dag_b.id()]) + + def test_build_remote_manifest_rejects_non_commit_root(self, remote_ops): + """Test that _build_remote_manifest rejects non-commit root namespaces.""" + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "blob", # Not "commit" + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": {}, + } + + with pytest.raises(ValueError, match="Cannot push non-commit root namespace: 'blob'"): + remote_ops._build_remote_manifest(local_manifest) + + def test_manifest_bytes_are_stable(self, remote_ops): + """Test that identical manifests produce identical canonical bytes.""" + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": { + "commit": {"0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef": "data"}, + "blob": {"fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321": "moredata"}, + }, + } + + # Build twice + _, bytes1 = remote_ops._build_remote_manifest(local_manifest) + _, bytes2 = remote_ops._build_remote_manifest(local_manifest) + + # Should be identical + assert bytes1 == bytes2 + + # Verify it's canonical JSON + json_str = bytes1.decode("utf-8") + parsed = json.loads(json_str) + recreated = json.dumps(parsed, separators=(",", ":"), 
sort_keys=True).encode("utf-8") + assert recreated == bytes1 + + +class TestPushUploadObjects: + """Tests for push upload objects functionality.""" + + def test_push_uploads_only_missing_objects(self, remote_ops): + """Test that _push_upload_objects only uploads objects that don't exist remotely.""" + # Create test data + test_data1 = b"Hello, World!" + test_data2 = b"Goodbye, World!" + + # Compute SHA256 hashes + oid1 = hashlib.sha256(test_data1).hexdigest() + oid2 = hashlib.sha256(test_data2).hexdigest() + + # Encode as base64 + b64_data1 = base64.b64encode(test_data1).decode("ascii") + b64_data2 = base64.b64encode(test_data2).decode("ascii") + + # Create local manifest with both objects + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": { + "blob": { + oid1: b64_data1, + oid2: b64_data2, + } + }, + } + + # Upload objects - both should be uploaded since they don't exist + remote_ops._push_upload_objects(local_manifest) + + # Verify both objects were uploaded + assert remote_ops._remote_has_cas(oid1) + assert remote_ops._remote_has_cas(oid2) + assert remote_ops._remote_get_cas(oid1) == test_data1 + assert remote_ops._remote_get_cas(oid2) == test_data2 + + # Now upload again - should not re-upload existing objects + # Mock the _remote_has_cas and _remote_put_cas to track calls + original_has_cas = remote_ops._remote_has_cas + original_put_cas = remote_ops._remote_put_cas + + has_cas_calls = [] + put_cas_calls = [] + + def mock_has_cas(oid): + has_cas_calls.append(oid) + return original_has_cas(oid) + + def mock_put_cas(oid, data): + put_cas_calls.append((oid, data)) + return original_put_cas(oid, data) + + remote_ops._remote_has_cas = mock_has_cas + remote_ops._remote_put_cas = mock_put_cas + + # Upload again - should check existence but not re-upload + remote_ops._push_upload_objects(local_manifest) + + # Should have checked both 
objects + assert oid1 in has_cas_calls + assert oid2 in has_cas_calls + + # Should not have uploaded anything (since both exist) + assert len(put_cas_calls) == 0 + + def test_push_rejects_bad_sha256_mismatch(self, remote_ops): + """Test that _push_upload_objects rejects objects with SHA256 mismatches.""" + # Create test data with wrong hash + test_data = b"Hello, World!" + wrong_oid = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" # All f's + correct_oid = hashlib.sha256(test_data).hexdigest() + + # Make sure wrong_oid is actually different + assert wrong_oid != correct_oid + + # Encode as base64 + b64_data = base64.b64encode(test_data).decode("ascii") + + # Create local manifest with mismatched hash + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": { + "blob": { + wrong_oid: b64_data, # Wrong OID for this data + } + }, + } + + # Should raise ValueError due to hash mismatch + with pytest.raises(ShaMismatch, match=f"SHA256 mismatch for object {wrong_oid}"): + remote_ops._push_upload_objects(local_manifest) + + +class TestPush: + """Tests for the full push functionality.""" + + def test_push_end_to_end_writes_cas_and_ref(self, integration_remote_ops_fn): + """Test that push end-to-end writes CAS objects and creates ref.""" + remote_ops = integration_remote_ops_fn + # Create test data + commit_data = b'{"kind": "commit", "tree": "tree123..."}' + blob_data = b"Hello, World!" 
+ + # Compute SHA256 hashes + commit_oid = hashlib.sha256(commit_data).hexdigest() + blob_oid = hashlib.sha256(blob_data).hexdigest() + + # Create local manifest + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": {commit_oid: base64.b64encode(commit_data).decode("ascii")}, + "blob": {blob_oid: base64.b64encode(blob_data).decode("ascii")}, + }, + } + # Mock _local_dump_dict to return our local manifest + with patch.object(remote_ops, "_local_dump_dict", return_value=local_manifest) as mock_dump: + with patch.object(remote_ops, "_direct_dag_ids", return_value=[]): + with patch.object( + remote_ops, + "_resolve_branch_push_target", + return_value=(Ref(f"commit:{commit_oid}"), f"tags/main/{commit_oid}.json"), + ): + # Push the head ref + ref_path = remote_ops.push("main") + + # Verify _local_dump_dict was called + mock_dump.assert_called_once() + + # Should return the ref path + assert ref_path == f"tags/main/{commit_oid}.json" + + # Verify CAS objects were uploaded + assert remote_ops._remote_has_cas(commit_oid) + assert remote_ops._remote_has_cas(blob_oid) + assert remote_ops._remote_get_cas(commit_oid) == commit_data + assert remote_ops._remote_get_cas(blob_oid) == blob_data + + # Verify manifest was uploaded + # Build expected remote manifest to get its hash + remote_manifest_dict, remote_manifest_bytes = remote_ops._build_remote_manifest(local_manifest) + manifest_id = hashlib.sha256(remote_manifest_bytes).hexdigest() + + manifest_bytes = remote_ops._remote_get_cas(manifest_id) + manifest = remote_ops._decode_manifest(manifest_bytes) + assert manifest["kind"] == "manifest" + assert manifest["root-ns"] == "commit" + assert manifest["root-id"] == commit_oid + assert sorted(manifest["closure"]["commit"]) == [commit_oid] + assert sorted(manifest["closure"]["blob"]) == [blob_oid] + + # Verify ref was created + ref_bytes = remote_ops._remote_get_ref(ref_path) + ref_obj = 
remote_ops._decode_ref(ref_bytes) + assert ref_obj["kind"] == "ref" + assert ref_obj["target"] == manifest_id + assert isinstance(ref_obj["created_at"], int) + assert ref_obj["targets"] == {"dag": []} + + def test_push_head_publishes_tag_ref(self, integration_remote_ops_fn): + """Test that pushing a head publishes a tag ref scoped by head name and commit id.""" + remote_ops = integration_remote_ops_fn + commit_data = b'{"kind":"commit","tree":"tree-head"}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": {commit_oid: base64.b64encode(commit_data).decode("ascii")}, + }, + } + with patch.object(remote_ops, "_local_dump_dict", return_value=local_manifest) as mock_dump: + with patch.object(remote_ops, "_direct_dag_ids", return_value=[]): + with patch.object( + remote_ops, + "_resolve_branch_push_target", + return_value=(Ref(f"commit:{commit_oid}"), f"tags/main/{commit_oid}.json"), + ): + ref_path = remote_ops.push("main") + + assert ref_path == f"tags/main/{commit_oid}.json" + assert mock_dump.call_args.args[1] == Ref(f"commit:{commit_oid}") + ref_bytes = remote_ops._remote_get_ref(ref_path) + ref_obj = remote_ops._decode_ref(ref_bytes) + assert ref_obj["kind"] == "ref" + assert ref_obj["target"] + assert ref_obj["targets"] == {"dag": []} + + def test_push_with_dag_targets_ensures_dag_refs_before_tag_ref(self, remote_ops): + """Test push computes targets and ensures DAG refs before writing tag ref.""" + commit_oid = "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + dag_a = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + dag_b = "1123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": {commit_oid: base64.b64encode(b"commit").decode("ascii")}, 
+ "dag": { + dag_b: base64.b64encode(b"dag-b").decode("ascii"), + dag_a: base64.b64encode(b"dag-a").decode("ascii"), + }, + }, + } + manifest_bytes = json.dumps( + { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"commit": [commit_oid], "dag": [dag_a, dag_b]}, + }, + separators=(",", ":"), + sort_keys=True, + ).encode("utf-8") + events = [] + with patch.object( + remote_ops, + "_resolve_branch_push_target", + return_value=(Ref(f"commit:{commit_oid}"), f"tags/main/{commit_oid}.json"), + ): + with patch.object(remote_ops, "_local_dump_dict", return_value=local_manifest): + with patch.object( + remote_ops, + "_direct_dag_ids", + return_value=[dag_a], + ): + with patch.object( + remote_ops, + "_ensure_dag_ref_in_txn", + side_effect=lambda dag_ref, _txn, _stack: events.append(("ensure", dag_ref.id())) or True, + ): + with patch.object( + remote_ops, "_push_upload_objects", side_effect=lambda _lm: events.append(("upload-raw",)) + ): + with patch.object( + remote_ops, + "_build_remote_manifest", + side_effect=lambda _lm, require_commit_root=True, direct_dag_ids=None: ( # noqa: ARG005 + { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"commit": [commit_oid], "dag": direct_dag_ids or []}, + }, + manifest_bytes, + ), + ): + with patch.object(remote_ops, "_remote_has_cas", return_value=False): + with patch.object( + remote_ops, + "_remote_put_cas", + side_effect=lambda oid, _data: events.append(("put-cas", oid)), + ): + with patch.object( + remote_ops, + "_remote_put_ref", + side_effect=lambda _path, data: events.append( + ("put-ref", json.loads(data)) + ), + ): + remote_ops.push("main") + + assert events[0:1] == [("ensure", dag_a)] + assert events[-1][0] == "put-ref" + assert events[-1][1]["targets"] == {"dag": [dag_a]} + + def test_push_ref_is_immutable(self, integration_remote_ops_fn): + """Test that pushing the same ref twice fails.""" + remote_ops = 
integration_remote_ops_fn + # Create test data (different from first test) + commit_data = b'{"kind": "commit", "tree": "tree456..."}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": {commit_oid: base64.b64encode(commit_data).decode("ascii")}, + }, + } + # Mock _local_dump_dict + with patch.object(remote_ops, "_local_dump_dict", return_value=local_manifest): + with patch.object(remote_ops, "_direct_dag_ids", return_value=[]): + with patch.object( + remote_ops, + "_resolve_branch_push_target", + return_value=(Ref(f"commit:{commit_oid}"), f"tags/main/{commit_oid}.json"), + ): + # First push should succeed + ref_path = remote_ops.push("main") + assert ref_path == f"tags/main/{commit_oid}.json" + + # Public API wraps remote errors at subsystem boundary + with pytest.raises(DmlRepoError, match="already exists"): + remote_ops.push("main") + + def test_push_manifest_uploaded_and_addressable(self, integration_remote_ops_fn): + """Test that the manifest is uploaded and can be retrieved via its hash.""" + remote_ops = integration_remote_ops_fn + # Create test data (different from other tests) + commit_data = b'{"kind": "commit", "tree": "tree789..."}' + blob_data = b"Hello, World!" 
+ + commit_oid = hashlib.sha256(commit_data).hexdigest() + blob_oid = hashlib.sha256(blob_data).hexdigest() + + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": {commit_oid: base64.b64encode(commit_data).decode("ascii")}, + "blob": {blob_oid: base64.b64encode(blob_data).decode("ascii")}, + }, + } + # Mock _local_dump_dict + with patch.object(remote_ops, "_local_dump_dict", return_value=local_manifest): + with patch.object(remote_ops, "_direct_dag_ids", return_value=[]): + with patch.object( + remote_ops, + "_resolve_branch_push_target", + return_value=(Ref(f"commit:{commit_oid}"), f"tags/main/{commit_oid}.json"), + ): + # Push + remote_ops.push("main") + + # Build expected remote manifest to get its hash + remote_manifest_dict, remote_manifest_bytes = remote_ops._build_remote_manifest(local_manifest) + expected_manifest_oid = hashlib.sha256(remote_manifest_bytes).hexdigest() + + # Verify manifest was uploaded with correct hash + assert remote_ops._remote_has_cas(expected_manifest_oid) + stored_manifest_bytes = remote_ops._remote_get_cas(expected_manifest_oid) + stored_manifest = remote_ops._decode_manifest(stored_manifest_bytes) + assert stored_manifest == remote_manifest_dict + + +class TestPull: + """Tests for the pull functionality.""" + + def test_load_ptr_in_txn_resolves_dag_refs_recursively(self, remote_ops): + """Test load_ptr_in_txn resolves closure['dag'] through refs/dags.""" + commit_data = b'{"kind":"commit"}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + dag_data = b'{"kind":"dag-root"}' + dag_oid = hashlib.sha256(dag_data).hexdigest() + blob_oid = hashlib.sha256(b"blob-data").hexdigest() + blob_data = b"blob-data" + + top_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"commit": [commit_oid], "dag": [dag_oid]}, + } + dag_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "dag", + 
"root-id": dag_oid, + "closure": {"datum-scalar": [blob_oid]}, + } + top_manifest_bytes = json.dumps(top_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + dag_manifest_bytes = json.dumps(dag_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + top_manifest_oid = hashlib.sha256(top_manifest_bytes).hexdigest() + dag_manifest_oid = hashlib.sha256(dag_manifest_bytes).hexdigest() + dag_ref = { + "kind": "ref", + "schema": 0, + "target": dag_manifest_oid, + "created_at": 1234567890, + "meta": {"dag": {"id": dag_oid}}, + } + + remote_ops._remote_put_cas(top_manifest_oid, top_manifest_bytes) + remote_ops._remote_put_cas(dag_manifest_oid, dag_manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + remote_ops._remote_put_cas(dag_oid, dag_data) + remote_ops._remote_put_cas(blob_oid, blob_data) + remote_ops._remote_put_dag_ref( + dag_oid, + json.dumps(dag_ref, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ) + + with remote_ops._tx(readonly=False) as txn: + with patch.object(remote_ops, "_local_has", return_value=False): + root_ref = remote_ops.load_ptr_in_txn(top_manifest_oid, txn, expected_root_ns="commit") + assert root_ref == Ref(f"commit:{commit_oid}") + assert txn.txn.get(Ref(f"commit:{commit_oid}"), raw=True) == base64.b64encode(commit_data).decode( + "ascii" + ) + assert txn.txn.get(Ref(f"datum-scalar:{blob_oid}"), raw=True) == base64.b64encode(blob_data).decode( + "ascii" + ) + assert txn.txn.get(Ref(f"dag:{dag_oid}"), raw=True) == base64.b64encode(dag_data).decode("ascii") + + def test_load_ptr_in_txn_fails_when_dag_ref_missing(self, remote_ops): + """Test strict failure when a referenced DAG ref is missing.""" + commit_data = b'{"kind":"commit-missing-dag-ref"}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + dag_oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdeb" + top_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": 
{"commit": [commit_oid], "dag": [dag_oid]}, + } + top_manifest_bytes = json.dumps(top_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + top_manifest_oid = hashlib.sha256(top_manifest_bytes).hexdigest() + remote_ops._remote_put_cas(top_manifest_oid, top_manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + + with pytest.raises(DmlRepoError, match=rf"Ref dags/{dag_oid}\.json not found"): + with remote_ops._tx(readonly=False) as txn: + with patch.object(remote_ops, "_local_has", return_value=False): + remote_ops.load_ptr_in_txn(top_manifest_oid, txn, expected_root_ns="commit") + + def test_load_ptr_in_txn_fails_when_dag_manifest_missing(self, remote_ops): + """Test strict failure when DAG ref target CAS is missing.""" + commit_data = b'{"kind":"commit-missing-dag-manifest"}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + dag_oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdec" + missing_manifest_oid = "3123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + top_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"commit": [commit_oid], "dag": [dag_oid]}, + } + top_manifest_bytes = json.dumps(top_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + top_manifest_oid = hashlib.sha256(top_manifest_bytes).hexdigest() + dag_ref = { + "kind": "ref", + "schema": 0, + "target": missing_manifest_oid, + "created_at": 1234567890, + "meta": {"dag": {"id": dag_oid}}, + } + remote_ops._remote_put_cas(top_manifest_oid, top_manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + remote_ops._remote_put_dag_ref( + dag_oid, + json.dumps(dag_ref, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ) + + with remote_ops._tx(readonly=False) as txn: + with pytest.raises(MissingCasObject, match=f"CAS object {missing_manifest_oid} not found"): + remote_ops.load_ptr_in_txn(top_manifest_oid, txn, expected_root_ns="commit") + 
+ def test_pull_resolves_dag_refs(self, integration_remote_ops_fn): + """Test pull materializes child DAG manifests through refs/dags.""" + remote_ops = integration_remote_ops_fn + commit_data = b'{"kind":"commit","tree":"tree123"}' + blob_data = b"blob-data" + commit_oid = hashlib.sha256(commit_data).hexdigest() + blob_oid = hashlib.sha256(blob_data).hexdigest() + dag_data = b'{"kind":"dag-root-pull"}' + dag_oid = hashlib.sha256(dag_data).hexdigest() + + top_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"commit": [commit_oid], "dag": [dag_oid]}, + } + dag_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "dag", + "root-id": dag_oid, + "closure": {"datum-scalar": [blob_oid]}, + } + top_manifest_bytes = json.dumps(top_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + dag_manifest_bytes = json.dumps(dag_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + top_manifest_oid = hashlib.sha256(top_manifest_bytes).hexdigest() + dag_manifest_oid = hashlib.sha256(dag_manifest_bytes).hexdigest() + top_ref = { + "kind": "ref", + "schema": 0, + "target": top_manifest_oid, + "created_at": 1234567890, + "targets": {"dag": [dag_oid]}, + } + dag_ref = { + "kind": "ref", + "schema": 0, + "target": dag_manifest_oid, + "created_at": 1234567890, + "meta": {"dag": {"id": dag_oid}}, + } + + remote_ops._remote_put_cas(top_manifest_oid, top_manifest_bytes) + remote_ops._remote_put_cas(dag_manifest_oid, dag_manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + remote_ops._remote_put_cas(dag_oid, dag_data) + remote_ops._remote_put_cas(blob_oid, blob_data) + remote_ops._remote_put_ref( + "tags/main/with-dag.json", + json.dumps(top_ref, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ) + remote_ops._remote_put_dag_ref( + dag_oid, + json.dumps(dag_ref, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ) + + with patch.object(remote_ops, "_local_has", 
return_value=False): + remote_ops.pull("tags/main/with-dag.json") + with remote_ops._tx(readonly=True) as txn: + assert txn.txn.get(Ref(f"commit:{commit_oid}"), raw=True) == base64.b64encode(commit_data).decode( + "ascii" + ) + assert txn.txn.get(Ref(f"datum-scalar:{blob_oid}"), raw=True) == base64.b64encode(blob_data).decode( + "ascii" + ) + assert txn.txn.get(Ref(f"dag:{dag_oid}"), raw=True) == base64.b64encode(dag_data).decode("ascii") + + def test_pull_fails_when_dag_ref_is_malformed(self, remote_ops): + """Test pull fails on malformed DAG refs.""" + commit_oid = "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + dag_oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + top_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"dag": [dag_oid]}, + } + top_manifest_bytes = json.dumps(top_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + top_manifest_oid = hashlib.sha256(top_manifest_bytes).hexdigest() + top_ref = { + "kind": "ref", + "schema": 0, + "target": top_manifest_oid, + "created_at": 1234567890, + "targets": {"dag": [dag_oid]}, + } + + remote_ops._remote_put_cas(top_manifest_oid, top_manifest_bytes) + remote_ops._remote_put_ref( + "tags/main/bad-dag-ref.json", + json.dumps(top_ref, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ) + remote_ops.client.put_object( + Bucket=remote_ops.bucket, + Key=remote_ops._dag_ref_key(dag_oid), + Body=json.dumps({"kind": "not-ref", "schema": 0}).encode("utf-8"), + ) + + with pytest.raises(DmlRepoError, match="kind must be 'ref'"): + remote_ops.pull("tags/main/bad-dag-ref.json") + + def test_load_ptr_in_txn_fails_when_dag_ref_missing_even_if_raw_dag_exists(self, remote_ops): + """Test readers no longer fall back to raw DAG CAS without refs/dags.""" + commit_data = b'{"kind":"commit-legacy-dag"}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + dag_data = b'{"kind":"dag-legacy"}' + dag_oid = 
hashlib.sha256(dag_data).hexdigest() + top_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"commit": [commit_oid], "dag": [dag_oid]}, + } + top_manifest_bytes = json.dumps(top_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + top_manifest_oid = hashlib.sha256(top_manifest_bytes).hexdigest() + remote_ops._remote_put_cas(top_manifest_oid, top_manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + remote_ops._remote_put_cas(dag_oid, dag_data) + + with pytest.raises(DmlRepoError, match=rf"Ref dags/{dag_oid}\.json not found"): + with remote_ops._tx(readonly=False) as txn: + with patch.object(remote_ops, "_local_has", return_value=False): + remote_ops.load_ptr_in_txn(top_manifest_oid, txn, expected_root_ns="commit") + + def test_pull_rejects_non_commit_root(self, remote_ops): + """Test that pull rejects manifests with non-commit root namespace.""" + # Create a manifest with non-commit root + invalid_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "blob", # Not "commit" + "root-id": "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890", + "closure": {"blob": ["fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321"]}, + } + manifest_bytes = json.dumps(invalid_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + + # Create a ref pointing to this manifest + manifest_id = hashlib.sha256(manifest_bytes).hexdigest() + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + # Upload the invalid manifest and ref + remote_ops._remote_put_cas(manifest_id, manifest_bytes) + ref_path = "tags/main/invalid-root-ref.json" + remote_ops._remote_put_ref(ref_path, ref_bytes) + + # Public API wraps remote errors at subsystem boundary + with pytest.raises(DmlRepoError, 
match="Manifest root namespace mismatch"): + remote_ops.pull(ref_path) + + def test_pull_downloads_missing_objects_only(self, integration_remote_ops_fn): + """Test that pull downloads only objects that are not already local.""" + remote_ops = integration_remote_ops_fn + # Create test data + commit_data = b'{"kind": "commit", "tree": "tree123"}' + blob_data = b"Hello, World!" + + commit_oid = hashlib.sha256(commit_data).hexdigest() + blob_oid = hashlib.sha256(blob_data).hexdigest() + + # Create manifest + manifest_data = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": [commit_oid], + "datum-scalar": [blob_oid], + }, + } + manifest_bytes = json.dumps(manifest_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_id = hashlib.sha256(manifest_bytes).hexdigest() + + # Create ref + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + # Upload to remote + remote_ops._remote_put_cas(manifest_id, manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + remote_ops._remote_put_cas(blob_oid, blob_data) + ref_path = "tags/main/test-missing.json" + remote_ops._remote_put_ref(ref_path, ref_bytes) + + with remote_ops._tx(readonly=False) as txn: + inserted_ref = remote_ops._put_local_cas_object(txn, "commit", commit_oid, commit_data) + assert inserted_ref == Ref(f"commit:{commit_oid}") + + remote_ops.pull(ref_path) + + with remote_ops._tx(readonly=True) as txn: + assert txn.txn.get(Ref(f"commit:{commit_oid}"), raw=True) == base64.b64encode(commit_data).decode("ascii") + assert txn.txn.get(Ref(f"datum-scalar:{blob_oid}"), raw=True) == base64.b64encode(blob_data).decode("ascii") + + def test_pull_verifies_sha256(self, remote_ops): + """Test that pull verifies SHA256 of downloaded objects.""" + # Create test data with wrong 
content + blob_data = b"Hello, World!" + wrong_oid = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" # Wrong OID + + manifest_data = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890", + "closure": { + "blob": [wrong_oid], # OID doesn't match data + }, + } + manifest_bytes = json.dumps(manifest_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_id = hashlib.sha256(manifest_bytes).hexdigest() + + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + # Upload with wrong data + remote_ops._remote_put_cas(manifest_id, manifest_bytes) + remote_ops._remote_put_cas(wrong_oid, blob_data) # Wrong data for this OID + ref_path = "tags/main/test-sha256.json" + remote_ops._remote_put_ref(ref_path, ref_bytes) + + with patch.object(remote_ops, "_local_has", return_value=False): + with pytest.raises(DmlRepoError, match=f"SHA256 mismatch for object {wrong_oid}"): + remote_ops.pull(ref_path) + + def test_pull_writes_head_pointer(self, integration_remote_ops_fn): + """Test that pull writes the correct head pointer.""" + remote_ops = integration_remote_ops_fn + # Create test data + commit_data = b'{"kind": "commit", "tree": "tree123"}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + # Create manifest + manifest_data = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": [commit_oid], + }, + } + manifest_bytes = json.dumps(manifest_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_id = hashlib.sha256(manifest_bytes).hexdigest() + # Create ref + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_obj, 
separators=(",", ":"), sort_keys=True).encode("utf-8") + # Upload to remote + remote_ops._remote_put_cas(manifest_id, manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + ref_path = "tags/main/test-head.json" + remote_ops._remote_put_ref(ref_path, ref_bytes) + # Pull should succeed and write head pointer + remote_ops.pull(ref_path) + # Verify head pointer was written + remote_name = f"s3://{remote_ops.bucket}" + if remote_ops.prefix: + remote_name = f"s3://{remote_ops.bucket}/{remote_ops.prefix}" + branch_name = f"{remote_name}/{ref_path}" + + pointer_path = HeadOps(_db=remote_ops._db)._external_tracking_path(branch_name) + assert pointer_path.read_text(encoding="utf-8") == commit_oid + + +class TestTask17PublicApiPolish: + """Tests for Task 17: Public API polish with typed exceptions.""" + + def test_push_raises_on_non_commit_root(self, remote_ops, db): + """Test that push surfaces non-commit root namespace as DmlRepoError.""" + # Create local manifest with non-commit root + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "blob", # Not "commit" + "root-id": "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890", + "closure": {}, + } + + # Mock _local_dump_dict to return the invalid manifest + with patch.object(remote_ops, "_local_dump_dict", return_value=local_manifest): + commit_ref = Ref("commit:abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890") + with patch.object( + remote_ops, + "_resolve_branch_push_target", + return_value=(commit_ref, "tags/main/test.json"), + ): + with patch.object(remote_ops, "_direct_dag_ids", return_value=[]): + with pytest.raises(DmlRepoError, match="Cannot push non-commit root namespace: 'blob'"): + remote_ops.push("main") + + def test_pull_raises_on_missing_cas_object(self, remote_ops): + """Test that pull surfaces missing CAS objects as DmlRepoError.""" + # Clear the bucket first + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = 
remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Create a ref pointing to a manifest that doesn't exist + manifest_id = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + # Upload the ref but not the manifest + ref_path = "tags/main/missing-manifest.json" + remote_ops._remote_put_ref(ref_path, ref_bytes) + + # Public API wraps remote errors at subsystem boundary + with pytest.raises(DmlRepoError, match=f"CAS object {manifest_id} not found"): + remote_ops.pull(ref_path) + + def test_decode_ref_raises_invalid_ref(self, remote_ops): + """Test that _decode_ref raises InvalidRef for invalid refs.""" + # Test invalid kind + invalid_ref = { + "kind": "invalid", + "schema": 0, + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="kind must be 'ref'"): + remote_ops._decode_ref(ref_bytes) + + # Test invalid schema + invalid_ref = { + "kind": "ref", + "schema": 1, # Invalid + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="schema must be 0"): + remote_ops._decode_ref(ref_bytes) + + # Test invalid target + invalid_ref = { + "kind": "ref", + "schema": 0, + "target": "invalid-target", + "created_at": 1234567890, + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="target must be 64 lowercase hex"): + 
remote_ops._decode_ref(ref_bytes) + + # Test invalid targets object + invalid_ref = { + "kind": "ref", + "schema": 0, + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + "targets": [], + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="targets must be an object"): + remote_ops._decode_ref(ref_bytes) + + # Test invalid targets namespace + invalid_ref["targets"] = {"blob": []} + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="targets supports only the 'dag' namespace"): + remote_ops._decode_ref(ref_bytes) + + # Test unsorted dag targets + invalid_ref["targets"] = { + "dag": [ + "1123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + ] + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="targets.dag must be a sorted unique list of 64 lowercase hex ids"): + remote_ops._decode_ref(ref_bytes) + + # Test duplicate dag targets + invalid_ref["targets"] = { + "dag": [ + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + ] + } + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="targets.dag must be a sorted unique list of 64 lowercase hex ids"): + remote_ops._decode_ref(ref_bytes) + + # Test malformed dag target id + invalid_ref["targets"] = {"dag": ["not-an-oid"]} + ref_bytes = json.dumps(invalid_ref).encode("utf-8") + with pytest.raises(InvalidRef, match="targets.dag must be a sorted unique list of 64 lowercase hex ids"): + remote_ops._decode_ref(ref_bytes) + + +class TestDagPublicationHelpers: + """Tests for per-DAG publication helpers.""" + + def test_put_ref_manifest_uploads_top_manifest_and_ensures_dags(self, remote_ops, monkeypatch): + """Test top-level manifest upload 
via put_ref_manifest.""" + dag_a = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + dag_b = "1123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "closure": { + "commit": { + "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef": base64.b64encode( + b"commit" + ).decode("ascii") + }, + "dag": { + dag_b: base64.b64encode(b"dag-b").decode("ascii"), + dag_a: base64.b64encode(b"dag-a").decode("ascii"), + }, + }, + } + manifest_bytes = ( + b'{"closure":{"commit":["2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"],' + b'"dag":["0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",' + b'"1123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]},' + b'"kind":"manifest","root-id":"2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",' + b'"root-ns":"commit","schema":0}' + ) + manifest_oid = hashlib.sha256(manifest_bytes).hexdigest() + ensured = [] + uploaded = [] + + monkeypatch.setattr(remote_ops, "_local_dump_dict", lambda _txn, _ref: local_manifest) + monkeypatch.setattr(remote_ops, "_direct_dag_ids", lambda _txn, _root_ref: [dag_a, dag_b]) + monkeypatch.setattr( + remote_ops, "_ensure_dag_ref_in_txn", lambda dag_ref, _txn, _stack: ensured.append(dag_ref.id()) or True + ) + monkeypatch.setattr( + remote_ops, "_push_upload_objects", lambda manifest: uploaded.append(("raw", manifest["root-id"])) + ) + monkeypatch.setattr( + remote_ops, + "_build_remote_manifest", + lambda _manifest, require_commit_root=False, direct_dag_ids=None: ({"kind": "manifest"}, manifest_bytes), + ) # noqa: ARG005 + monkeypatch.setattr(remote_ops, "_remote_has_cas", lambda _oid: False) + monkeypatch.setattr(remote_ops, "_remote_put_cas", lambda oid, data: uploaded.append((oid, data))) + + assert ( + 
remote_ops.put_ref_manifest(Ref("commit:2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef")) + == manifest_oid + ) + assert ensured == [dag_a, dag_b] + assert uploaded[0] == ("raw", local_manifest["root-id"]) + assert uploaded[1] == (manifest_oid, manifest_bytes) + + def test_put_cache_ref_writes_targets(self, remote_ops): + """Test cache refs include validated targets.""" + target = "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + dag_id = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + remote_ops.put_cache_ref("cache:key", target, targets={"dag": [dag_id]}, execution_id="exec-1") + + ref_obj = remote_ops._decode_ref(remote_ops._remote_get_ref("cache/cache:key.json")) + assert ref_obj["target"] == target + assert ref_obj["targets"] == {"dag": [dag_id]} + assert ref_obj["execution_id"] == "exec-1" + + def test_put_cache_ref_rejects_existing_ref(self, remote_ops): + target = "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + remote_ops.put_cache_ref("cache:guarded", target, targets={"dag": []}, execution_id="exec-live") + + with pytest.raises(DmlRepoError, match="already exists"): + remote_ops.put_cache_ref("cache:guarded", target, targets={"dag": []}, execution_id="exec-next") + + def test_delete_cache_ref_if_execution_id_is_guarded(self, remote_ops): + target = "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + cache_key = "cache:guarded-delete" + remote_ops.put_cache_ref(cache_key, target, targets={"dag": []}, execution_id="exec-live") + + assert remote_ops.delete_cache_ref_if_execution_id(cache_key, "exec-stale") is False + assert remote_ops.get_cache_ref_info(cache_key)["execution_id"] == "exec-live" + + def test_invalidate_cache_plans_reverse_closure(self, remote_ops): + now = int(time.time()) + _put_remote_json( + remote_ops, + "exec/state/e-root.json", + { + "execution_id": "e-root", + "cache_key": "ck-root", + "lifecycle": "succeeded", + "updated_at": now, + 
"spawned_execution_ids": [], + "cancellation_requested_by": None, + }, + ) + _put_remote_json( + remote_ops, + "exec/state/e-caller.json", + { + "execution_id": "e-caller", + "cache_key": "ck-caller", + "lifecycle": "succeeded", + "updated_at": now, + "spawned_execution_ids": ["e-root"], + "cancellation_requested_by": None, + }, + ) + _put_remote_json( + remote_ops, + "exec/edges/e-root/e-caller.json", + {"caller_execution_id": "e-caller", "callee_execution_id": "e-root"}, + ) + target = "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + remote_ops.put_cache_ref("ck-root", target, targets={"dag": []}, execution_id="e-root") + remote_ops.put_cache_ref("ck-caller", target, targets={"dag": []}, execution_id="e-caller") + + result = remote_ops.invalidate_cache(["ck-root"], requested_by="alice@example.com") + + assert result["invalidated_execution_ids"] == ["e-caller", "e-root"] + assert remote_ops.get_cache_ref_info("ck-root") is None + assert remote_ops.get_cache_ref_info("ck-caller") is None + root_tombstone, _ = remote_ops._get_json_key_with_etag(remote_ops._execution_invalidate_key("e-root")) + caller_tombstone, _ = remote_ops._get_json_key_with_etag(remote_ops._execution_invalidate_key("e-caller")) + assert root_tombstone["requested_by"] == "alice@example.com" + assert caller_tombstone["requested_by"] == "alice@example.com" + + def test_cancel_executions_skips_shared_and_terminal_dependencies(self, remote_ops): + now = int(time.time()) + for state in [ + { + "execution_id": "e-root", + "cache_key": "ck-root", + "lifecycle": "running", + "updated_at": now, + "spawned_execution_ids": ["e-sole", "e-shared", "e-terminal"], + "cancellation_requested_by": None, + }, + { + "execution_id": "e-sole", + "cache_key": "ck-sole", + "lifecycle": "running", + "updated_at": now, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + }, + { + "execution_id": "e-shared", + "cache_key": "ck-shared", + "lifecycle": "running", + "updated_at": now, + 
"spawned_execution_ids": [], + "cancellation_requested_by": None, + }, + { + "execution_id": "e-other", + "cache_key": "ck-other", + "lifecycle": "running", + "updated_at": now, + "spawned_execution_ids": ["e-shared"], + "cancellation_requested_by": None, + }, + { + "execution_id": "e-terminal", + "cache_key": "ck-terminal", + "lifecycle": "succeeded", + "updated_at": now, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + }, + ]: + _put_remote_json(remote_ops, f"exec/state/{state['execution_id']}.json", state) + for callee, caller in [ + ("e-sole", "e-root"), + ("e-shared", "e-root"), + ("e-shared", "e-other"), + ("e-terminal", "e-root"), + ]: + _put_remote_json( + remote_ops, + f"exec/edges/{callee}/{caller}.json", + {"caller_execution_id": caller, "callee_execution_id": callee}, + ) + + result = remote_ops.cancel_executions(["e-root"], requested_by="alice@example.com") + + assert result["cancel_pending_execution_ids"] == ["e-sole", "e-root"] + root_state, _ = remote_ops._get_json_key_with_etag(remote_ops._execution_state_key("e-root")) + sole_state, _ = remote_ops._get_json_key_with_etag(remote_ops._execution_state_key("e-sole")) + shared_state, _ = remote_ops._get_json_key_with_etag(remote_ops._execution_state_key("e-shared")) + terminal_state, _ = remote_ops._get_json_key_with_etag(remote_ops._execution_state_key("e-terminal")) + assert root_state["lifecycle"] == "cancel-pending" + assert sole_state["lifecycle"] == "cancel-pending" + assert shared_state["lifecycle"] == "running" + assert terminal_state["lifecycle"] == "succeeded" + + def test_put_cache_ref_rejects_invalid_targets(self, remote_ops): + """Test cache ref validation rejects malformed targets.""" + target = "2123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + with pytest.raises(DmlRepoError, match="Invalid targets"): + remote_ops.put_cache_ref("cache:key", target, targets={"blob": []}, execution_id="exec-1") + + def 
test_decode_manifest_raises_invalid_manifest(self, remote_ops): + """Test that _decode_manifest raises InvalidManifest for invalid manifests.""" + # Test invalid kind + invalid_manifest = { + "kind": "invalid", + "schema": 0, + "root-ns": "commit", + "root-id": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "closure": {}, + } + manifest_bytes = json.dumps(invalid_manifest).encode("utf-8") + with pytest.raises(InvalidManifest, match="kind must be 'manifest'"): + remote_ops._decode_manifest(manifest_bytes) + + # Test invalid schema + invalid_manifest = { + "kind": "manifest", + "schema": 1, # Invalid + "root-ns": "commit", + "root-id": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "closure": {}, + } + manifest_bytes = json.dumps(invalid_manifest).encode("utf-8") + with pytest.raises(InvalidManifest, match="schema must be 0"): + remote_ops._decode_manifest(manifest_bytes) + + # Test invalid OID in closure + invalid_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "closure": { + "commit": ["invalid-oid"], + }, + } + manifest_bytes = json.dumps(invalid_manifest).encode("utf-8") + with pytest.raises(InvalidManifest, match="must be 64 lowercase hex"): + remote_ops._decode_manifest(manifest_bytes) + + def test_cas_key_raises_invalid_oid(self, remote_ops): + """Test that _cas_key raises InvalidOid for invalid OIDs.""" + # Test invalid characters + with pytest.raises(InvalidOid, match="Invalid OID"): + remote_ops._cas_key("gggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg") + + # Test wrong length + with pytest.raises(InvalidOid, match="Invalid OID"): + remote_ops._cas_key("0123456789abcdef") + + def test_push_upload_objects_raises_sha_mismatch(self, remote_ops): + """Test that _push_upload_objects raises ShaMismatch for SHA mismatches.""" + # Create data with wrong OID + test_data = b"Hello, World!" 
+ wrong_oid = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" # All f's + correct_oid = hashlib.sha256(test_data).hexdigest() + + # Make sure they're different + assert wrong_oid != correct_oid + + # Create local manifest with mismatched hash + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "root1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + "closure": { + "blob": { + wrong_oid: base64.b64encode(test_data).decode("ascii"), + } + }, + } + + # Should raise ShaMismatch + with pytest.raises(ShaMismatch, match=f"SHA256 mismatch for object {wrong_oid}"): + remote_ops._push_upload_objects(local_manifest) + + def test_pull_raises_sha_mismatch_on_bad_cas_data(self, remote_ops): + """Test that pull raises ShaMismatch when downloaded CAS data has wrong hash.""" + # Create test data with wrong content + blob_data = b"Hello, World!" + wrong_oid = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" + + manifest_data = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890", + "closure": { + "blob": [wrong_oid], + }, + } + manifest_bytes = json.dumps(manifest_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_id = hashlib.sha256(manifest_bytes).hexdigest() + + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + # Upload with wrong data + remote_ops._remote_put_cas(manifest_id, manifest_bytes) + remote_ops._remote_put_cas(wrong_oid, blob_data) # Wrong data for this OID + ref_path = "tags/main/test-sha-mismatch.json" + remote_ops._remote_put_ref(ref_path, ref_bytes) + + with patch.object(remote_ops, "_local_has", return_value=False): + with pytest.raises(DmlRepoError, match=f"SHA256 mismatch for object 
{wrong_oid}"): + remote_ops.pull(ref_path) + + +class TestGcMark: + """Tests for GC mark phase.""" + + def test_gc_mark_includes_manifest_and_closure_oids(self, remote_ops): + """Test that _gc_mark includes manifest targets and closure OIDs.""" + # Clear the bucket to ensure clean state + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Create test data for commit and blob objects + commit_data = b'{"kind": "commit", "tree": "tree123"}' + blob_data = b"Hello, World!" + tree_data = b'{"kind": "tree", "entries": []}' + + commit_oid = hashlib.sha256(commit_data).hexdigest() + blob_oid = hashlib.sha256(blob_data).hexdigest() + tree_oid = hashlib.sha256(tree_data).hexdigest() + + # Create a manifest that references these objects + manifest_data = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": [commit_oid], + "blob": [blob_oid], + "tree": [tree_oid], + }, + } + manifest_bytes = json.dumps(manifest_data, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_id = hashlib.sha256(manifest_bytes).hexdigest() + + # Create a commit ref pointing to the manifest + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + # Upload all CAS objects + remote_ops._remote_put_cas(manifest_id, manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + remote_ops._remote_put_cas(blob_oid, blob_data) + remote_ops._remote_put_cas(tree_oid, tree_data) + + # Upload the commit ref + ref_path = "tags/main/test-commit.json" + remote_ops._remote_put_ref(ref_path, ref_bytes) + + # Run GC mark + live_oids = 
remote_ops._gc_mark() + + # Should include: + # - manifest_id (ref target) + # - commit_oid, blob_oid, tree_oid (from closure) + expected_oids = {manifest_id, commit_oid, blob_oid, tree_oid} + assert live_oids == expected_oids + + # Test with multiple refs (tags + cache) + # Create a cache ref + cache_ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_id, # Same manifest + "created_at": 1234567891, + "targets": {"dag": []}, + } + cache_ref_bytes = json.dumps(cache_ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + cache_ref_path = "cache/test-cache.json" + remote_ops._remote_put_ref(cache_ref_path, cache_ref_bytes) + + # Run GC mark again + live_oids = remote_ops._gc_mark() + + # Should still include the same OIDs (manifest referenced by both refs) + assert live_oids == expected_oids + + def test_gc_mark_warns_and_deletes_malformed_manifest_by_default(self, remote_ops, caplog): + """Test default malformed='warn' behavior for malformed manifests.""" + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + invalid_manifest = { + "kind": "not-a-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "closure": {}, + } + manifest_bytes = json.dumps(invalid_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_oid = hashlib.sha256(manifest_bytes).hexdigest() + + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_oid, + "created_at": 1234567890, + "targets": {"dag": []}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + remote_ops._remote_put_cas(manifest_oid, manifest_bytes) + remote_ops._remote_put_ref("tags/main/invalid-manifest.json", ref_bytes) + + 
live_oids = remote_ops._gc_mark() + assert live_oids == {manifest_oid} + assert not remote_ops._remote_has_cas(manifest_oid) + assert f"Malformed manifest {manifest_oid}: kind must be 'manifest'" in caplog.text + + def test_gc_mark_raises_on_malformed_manifest_when_requested(self, remote_ops): + """Test malformed='raise' fails with a clear message.""" + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + invalid_manifest = { + "kind": "not-a-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "closure": {}, + } + manifest_bytes = json.dumps(invalid_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_oid = hashlib.sha256(manifest_bytes).hexdigest() + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_oid, + "created_at": 1234567890, + "targets": {"dag": []}, + } + remote_ops._remote_put_cas(manifest_oid, manifest_bytes) + remote_ops._remote_put_ref( + "tags/main/invalid-manifest.json", + json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ) + + with pytest.raises(DmlRepoError, match=rf"Malformed manifest {manifest_oid}: kind must be 'manifest'"): + remote_ops._gc_mark(malformed="raise") + assert remote_ops._remote_has_cas(manifest_oid) + + def test_gc_mark_ignores_warning_but_deletes_malformed_manifest(self, remote_ops, caplog): + """Test malformed='ignore' suppresses warnings but still deletes malformed objects.""" + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, 
Key=obj["Key"]) + + invalid_manifest = { + "kind": "not-a-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "closure": {}, + } + manifest_bytes = json.dumps(invalid_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_oid = hashlib.sha256(manifest_bytes).hexdigest() + ref_obj = { + "kind": "ref", + "schema": 0, + "target": manifest_oid, + "created_at": 1234567890, + "targets": {"dag": []}, + } + remote_ops._remote_put_cas(manifest_oid, manifest_bytes) + remote_ops._remote_put_ref( + "tags/main/invalid-manifest.json", + json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ) + + caplog.clear() + live_oids = remote_ops._gc_mark(malformed="ignore") + assert live_oids == {manifest_oid} + assert not remote_ops._remote_has_cas(manifest_oid) + assert f"Malformed manifest {manifest_oid}" not in caplog.text + + def test_gc_mark_raises_on_malformed_root_ref_when_requested(self, remote_ops): + """Test malformed='raise' names the bad root ref and reason.""" + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + remote_ops.client.put_object( + Bucket=remote_ops.bucket, + Key=remote_ops._ref_key("tags/main/bad.json"), + Body=json.dumps({"kind": "nope", "schema": 0}).encode("utf-8"), + ) + + with pytest.raises(DmlRepoError, match=r"Malformed ref refs/tags/main/bad.json: kind must be 'ref'"): + remote_ops._gc_mark(malformed="raise") + + def test_gc_mark_follows_dag_refs_not_dag_refs_as_roots(self, remote_ops): + """Test GC follows DAG refs only from tag/cache roots.""" + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in 
paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + commit_data = b'{"kind":"commit"}' + blob_data = b"blob-data" + dag_data = b'{"kind":"dag-root"}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + blob_oid = hashlib.sha256(blob_data).hexdigest() + dag_oid = hashlib.sha256(dag_data).hexdigest() + top_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"commit": [commit_oid], "dag": [dag_oid]}, + } + dag_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "dag", + "root-id": dag_oid, + "closure": {"blob": [blob_oid]}, + } + top_manifest_bytes = json.dumps(top_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + dag_manifest_bytes = json.dumps(dag_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + top_manifest_oid = hashlib.sha256(top_manifest_bytes).hexdigest() + dag_manifest_oid = hashlib.sha256(dag_manifest_bytes).hexdigest() + remote_ops._remote_put_cas(top_manifest_oid, top_manifest_bytes) + remote_ops._remote_put_cas(dag_manifest_oid, dag_manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + remote_ops._remote_put_cas(dag_oid, dag_data) + remote_ops._remote_put_cas(blob_oid, blob_data) + remote_ops._remote_put_ref( + "tags/main/gc.json", + json.dumps( + { + "kind": "ref", + "schema": 0, + "target": top_manifest_oid, + "created_at": 1, + "targets": {"dag": [dag_oid]}, + }, + separators=(",", ":"), + sort_keys=True, + ).encode("utf-8"), + ) + remote_ops._remote_put_dag_ref( + dag_oid, + json.dumps( + { + "kind": "ref", + "schema": 0, + "target": dag_manifest_oid, + "created_at": 1, + "meta": {"dag": {"id": dag_oid}}, + }, + separators=(",", ":"), + sort_keys=True, + ).encode("utf-8"), + ) + + live_oids = remote_ops._gc_mark() + assert {top_manifest_oid, dag_manifest_oid, commit_oid, dag_oid, blob_oid}.issubset(live_oids) + + def 
test_gc_mark_skips_missing_dag_ref(self, remote_ops): + """Test GC skips missing DAG refs listed in targets/closure.""" + commit_data = b'{"kind":"commit"}' + commit_oid = hashlib.sha256(commit_data).hexdigest() + dag_oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + top_manifest = { + "kind": "manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": {"commit": [commit_oid], "dag": [dag_oid]}, + } + top_manifest_bytes = json.dumps(top_manifest, separators=(",", ":"), sort_keys=True).encode("utf-8") + top_manifest_oid = hashlib.sha256(top_manifest_bytes).hexdigest() + remote_ops._remote_put_cas(top_manifest_oid, top_manifest_bytes) + remote_ops._remote_put_cas(commit_oid, commit_data) + remote_ops._remote_put_ref( + "tags/main/missing-dag-ref.json", + json.dumps( + { + "kind": "ref", + "schema": 0, + "target": top_manifest_oid, + "created_at": 1, + "targets": {"dag": [dag_oid]}, + }, + separators=(",", ":"), + sort_keys=True, + ).encode("utf-8"), + ) + + live_oids = remote_ops._gc_mark() + assert top_manifest_oid in live_oids + assert commit_oid in live_oids + assert dag_oid not in live_oids + + +class TestGcSweep: + """Tests for GC sweep phase.""" + + def test_gc_sweep_deletes_only_unreferenced_and_old(self, remote_ops): + """Test that GC sweep deletes only unreferenced objects that are old enough.""" + # Clear the bucket first + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Create test OIDs + live_oid = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + dead_old_oid = "fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321" + dead_young_oid = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + + # Upload CAS objects with 
different timestamps + + # Live object (should be kept) + remote_ops._remote_put_cas(live_oid, b"live data") + + # Dead old object (should be deleted) + remote_ops._remote_put_cas(dead_old_oid, b"dead old data") + + # Dead young object (should be kept due to age) + remote_ops._remote_put_cas(dead_young_oid, b"dead young data") + + # Mock the LastModified timestamps to simulate different ages + # This is tricky with moto, so we'll use a small min_age_seconds and assume + # the objects are old enough, or we could patch the list_objects_v2 response + # For this test, we'll set min_age_seconds=0 to test reachability logic + + # Live OIDs set contains only the live OID + live_oids = {live_oid} + + # Run sweep with min_age_seconds=0 (so age doesn't prevent deletion) + result = remote_ops._gc_sweep(live_oids, min_age_seconds=0) + + # Should have deleted the dead_old_oid, kept live_oid and dead_young_oid + # (but since moto might not preserve exact timestamps, we'll just check the logic) + assert "deleted" in result + assert "kept_live" in result + assert "kept_young" in result + + # Verify live object still exists + assert remote_ops._remote_has_cas(live_oid) + + # Verify dead objects were deleted (since min_age_seconds=0) + # Note: This test may need adjustment based on how moto handles timestamps + # For now, let's just ensure the method runs without error + + def test_gc_does_not_delete_live_objects(self, remote_ops): + """Test that GC does not delete objects that are in live_oids.""" + # Clear the bucket first + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Create test OIDs + live_oid1 = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + live_oid2 = 
"fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321" + + # Upload live objects + remote_ops._remote_put_cas(live_oid1, b"live data 1") + remote_ops._remote_put_cas(live_oid2, b"live data 2") + + # Live OIDs set contains both + live_oids = {live_oid1, live_oid2} + + # Run sweep + result = remote_ops._gc_sweep(live_oids, min_age_seconds=0) + + # Should keep both live objects + assert result["kept_live"] >= 2 # At least the two we know about + + # Verify both objects still exist + assert remote_ops._remote_has_cas(live_oid1) + assert remote_ops._remote_has_cas(live_oid2) + + def test_gc_sweep_raises_on_malformed_cas_key(self, remote_ops): + """Test that _gc_sweep fails closed when CAS key layout is malformed.""" + bucket, _prefix = remote_bucket_and_prefix_from_env() + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + malformed_tail = "not-an-oid" + if remote_ops.prefix: + key = f"{remote_ops.prefix}/cas/sha256/00/00/{malformed_tail}" + else: + key = f"cas/sha256/00/00/{malformed_tail}" + remote_ops.client.put_object(Bucket=bucket, Key=key, Body=b"junk") + + with pytest.raises(InvalidOid, match="Invalid CAS key"): + remote_ops._gc_sweep(set(), min_age_seconds=0) + + +class TestGc: + """Tests for the full GC functionality.""" + + def test_gc_calls_prune_and_sweep(self, remote_ops): + """Test that gc() calls prune() and performs sweep.""" + # Mock the methods to verify they're called + with patch.object(remote_ops, "prune", return_value=0) as mock_prune: + with patch.object(remote_ops, "_gc_mark", return_value=set()) as mock_mark: + with patch.object( + remote_ops, "_gc_sweep", return_value={"deleted": 0, "kept_live": 0, "kept_young": 0} + ) as mock_sweep: + result = remote_ops.gc(min_age_seconds=100) + + # Verify methods were called + mock_prune.assert_called_once() + 
mock_mark.assert_called_once_with(malformed="warn") + mock_sweep.assert_called_once_with(set(), 100) + + # Verify result is returned + assert result == {"deleted": 0, "kept_live": 0, "kept_young": 0} + + def test_gc_passes_through_explicit_malformed_policy(self, remote_ops): + """Test gc forwards malformed policy to mark phase.""" + with patch.object(remote_ops, "prune", return_value=0): + with patch.object(remote_ops, "_gc_mark", return_value=set()) as mock_mark: + with patch.object( + remote_ops, "_gc_sweep", return_value={"deleted": 0, "kept_live": 0, "kept_young": 0} + ): + remote_ops.gc(min_age_seconds=100, malformed="raise") + mock_mark.assert_called_once_with(malformed="raise") + + +class TestList: + """Tests for the list functionality.""" + + def test_list_returns_decoded_refs(self, remote_ops): + """Test that list returns properly decoded refs with ref_path.""" + # Clear any existing refs from previous tests + bucket, prefix = remote_bucket_and_prefix_from_env() + refs_prefix = f"{prefix}/refs/" if prefix else "refs/" + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket, Prefix=refs_prefix): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Create test ref data + ref_obj = { + "kind": "ref", + "schema": 0, + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + "targets": {"dag": []}, + "meta": {"author": "test@example.com", "message": "test commit"}, + } + ref_bytes = json.dumps(ref_obj, separators=(",", ":"), sort_keys=True).encode("utf-8") + + # Put the ref + ref_path = "tags/main/test-commit.json" + remote_ops._remote_put_ref(ref_path, ref_bytes) + + # List commits + refs = remote_ops.list("tags") + + # Should have one ref + assert len(refs) == 1 + ref = refs[0] + + # Should have the original fields + assert ref["kind"] == "ref" + assert ref["schema"] == 0 + assert 
ref["target"] == "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + assert ref["created_at"] == 1234567890 + assert ref["meta"] == {"author": "test@example.com", "message": "test commit"} + + # Should have inferred ref_path + assert ref["ref_path"] == ref_path + + def test_list_filters_prefix_correctly(self, remote_ops): + """Test that list filters refs by prefix correctly.""" + # Clear any existing refs from previous tests + bucket, prefix = remote_bucket_and_prefix_from_env() + refs_prefix = f"{prefix}/refs/" if prefix else "refs/" + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket, Prefix=refs_prefix): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Create refs in different prefixes + tag_ref = { + "kind": "ref", + "schema": 0, + "target": "fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321", + "created_at": 1234567891, + "targets": {"dag": []}, + } + cache_ref = { + "kind": "ref", + "schema": 0, + "target": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "created_at": 1234567892, + "targets": {"dag": []}, + } + + # Put refs in different prefixes + remote_ops._remote_put_ref( + "tags/release/v1.0.0.json", json.dumps(tag_ref, separators=(",", ":"), sort_keys=True).encode("utf-8") + ) + remote_ops._remote_put_ref( + "cache/temp.json", json.dumps(cache_ref, separators=(",", ":"), sort_keys=True).encode("utf-8") + ) + + # List each prefix + tags = remote_ops.list("tags") + cache = remote_ops.list("cache") + + # Should have correct counts + assert len(tags) == 1 + assert len(cache) == 1 + + # Should have correct ref_paths + assert tags[0]["ref_path"] == "tags/release/v1.0.0.json" + assert cache[0]["ref_path"] == "cache/temp.json" + + # Should have correct targets + assert tags[0]["target"] == tag_ref["target"] + assert cache[0]["target"] == cache_ref["target"] + + def 
test_list_raises_on_invalid_refs(self, remote_ops): + """Test that list fails closed when a ref cannot be decoded.""" + # Clear any existing refs from previous tests + bucket, prefix = remote_bucket_and_prefix_from_env() + refs_prefix = f"{prefix}/refs/" if prefix else "refs/" + paginator = remote_ops.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket, Prefix=refs_prefix): + if "Contents" in page: + for obj in page["Contents"]: + remote_ops.client.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Put a valid ref + valid_ref = { + "kind": "ref", + "schema": 0, + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 1234567890, + "targets": {"dag": []}, + } + remote_ops._remote_put_ref( + "tags/main/valid.json", json.dumps(valid_ref, separators=(",", ":"), sort_keys=True).encode("utf-8") + ) + + # Put invalid JSON directly to S3 (bypassing validation) + invalid_json = b'{"invalid": json}' + remote_ops.client.put_object( + Bucket=remote_ops.bucket, + Key=remote_ops._ref_key("tags/main/invalid.json"), + Body=invalid_json, + ) + + with pytest.raises(DmlRepoError, match="Expecting value"): + remote_ops.list("tags") + + def test_list_returns_empty_list_when_no_refs(self, remote_ops): + """Test that list returns empty list when no refs exist for allowed prefix.""" + refs = remote_ops.list("cache") + assert refs == [] + + def test_list_rejects_named_cache_prefixes(self, remote_ops): + """Test that list no longer accepts legacy named cache prefixes.""" + with pytest.raises(DmlRepoError, match="Expected 'tags' or 'cache'"): + remote_ops.list("cache/missing") + + +class TestPrune: + """Tests for the prune functionality.""" + + def test_prune_deletes_old_invoke_blobs_only(self, remote_ops): + """Test that prune deletes old invoke transport blobs.""" + key = f"{remote_ops.prefix}/io/invoke/test.json" if remote_ops.prefix else "io/invoke/test.json" + remote_ops.client.put_object(Bucket=remote_ops.bucket, 
Key=key, Body=b"{}") + remote_ops._IO_INVOKE_PRUNE_AGE_SECONDS = 0 + + deleted_count = remote_ops.prune() + assert deleted_count == 1 + + with pytest.raises(remote_ops.client.exceptions.ClientError): + remote_ops.client.get_object(Bucket=remote_ops.bucket, Key=key) + + def test_prune_does_not_delete_cache_refs(self, remote_ops): + """Test that prune does not delete cache refs by age metadata.""" + cache_ref = { + "kind": "ref", + "schema": 0, + "target": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "created_at": 0, + "targets": {"dag": []}, + } + remote_ops._remote_put_ref( + "cache/expired.json", + json.dumps(cache_ref, separators=(",", ":"), sort_keys=True).encode("utf-8"), + ) + deleted_count = remote_ops.prune() + assert deleted_count == 0 + assert json.loads(remote_ops._remote_get_ref("cache/expired.json")) == cache_ref + + +class TestE2E: + """End-to-end integration tests: push → pull → gc.""" + + def test_push_pull_gc_removes_unreferenced_manifest_and_closure_objects(self, aws_server, s3): + """Test complete push→pull→gc flow proves compatibility and GC correctness.""" + # Clear the bucket to ensure clean state + bucket, prefix = remote_bucket_and_prefix_from_env() + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + if "Contents" in page: + for obj in page["Contents"]: + s3.delete_object(Bucket=bucket, Key=obj["Key"]) + + # Step 1: Build local DB state with commit + closure objects + # Create test data for commit and blob + commit_data = b'{"kind": "commit", "tree": "tree123", "message": "test commit"}' + blob_data = b"Hello, World! This is test blob content." 
+ + # Compute SHA256 hashes + commit_oid = hashlib.sha256(commit_data).hexdigest() + blob_oid = hashlib.sha256(blob_data).hexdigest() + + # Create local manifest with closure + local_manifest = { + "kind": "local-manifest", + "schema": 0, + "root-ns": "commit", + "root-id": commit_oid, + "closure": { + "commit": {commit_oid: base64.b64encode(commit_data).decode("ascii")}, + "blob": {blob_oid: base64.b64encode(blob_data).decode("ascii")}, + }, + } + + # Create RemoteOps for push + push_db = FakeDb() + push_remote_ops = RemoteOps( + _db=push_db, + client=s3, + bucket=bucket, + prefix=prefix, + ) + + # Step 2: Push to S3 + with patch.object(push_remote_ops, "_local_dump_dict", return_value=local_manifest): + with patch.object( + push_remote_ops, + "_resolve_branch_push_target", + return_value=(Ref(f"commit:{commit_oid}"), f"tags/main/{commit_oid}.json"), + ): + with patch.object(push_remote_ops, "_direct_dag_ids", return_value=[]): + ref_path = push_remote_ops.push("main") + assert ref_path == f"tags/main/{commit_oid}.json" + + # Verify push artifacts exist + assert push_remote_ops._remote_has_cas(commit_oid) + assert push_remote_ops._remote_has_cas(blob_oid) + # Manifest should also exist (computed during push) + remote_manifest_dict, _ = push_remote_ops._build_remote_manifest(local_manifest) + manifest_bytes = json.dumps(remote_manifest_dict, separators=(",", ":"), sort_keys=True).encode("utf-8") + manifest_oid = hashlib.sha256(manifest_bytes).hexdigest() + assert push_remote_ops._remote_has_cas(manifest_oid) + # Ref should exist + ref_bytes = push_remote_ops._remote_get_ref(ref_path) + ref_obj = push_remote_ops._decode_ref(ref_bytes) + assert ref_obj["target"] == manifest_oid + + # Step 3: Create new empty DB for pull + pull_db = FakeDb() # Empty DB + pull_remote_ops = RemoteOps( + _db=pull_db, + client=s3, + bucket=bucket, + prefix=prefix, + ) + + # Step 4: Pull and validate + with patch.object(pull_remote_ops, "_local_has", return_value=False): # Nothing local + 
with patch.object(pull_remote_ops, "_local_put_head"): + pull_remote_ops.pull(ref_path) + + with pull_remote_ops._tx(readonly=True) as txn: + assert txn.txn.get(Ref(f"commit:{commit_oid}"), raw=True) == base64.b64encode(commit_data).decode("ascii") + assert txn.txn.get(Ref(f"blob:{blob_oid}"), raw=True) == base64.b64encode(blob_data).decode("ascii") + + # Step 5: Delete the created ref, run gc(min_age_seconds=0) + pull_remote_ops._remote_delete_ref(ref_path) + gc_result = pull_remote_ops.gc(min_age_seconds=0) + + # Step 6: Validate CAS objects deleted + # Should have deleted the manifest, commit, and blob objects + assert gc_result["deleted"] >= 3 # At least manifest + commit + blob + assert not pull_remote_ops._remote_has_cas(manifest_oid) + assert not pull_remote_ops._remote_has_cas(commit_oid) + assert not pull_remote_ops._remote_has_cas(blob_oid) diff --git a/tests/integration/internal/test_roundtrip_integration.py b/tests/integration/internal/test_roundtrip_integration.py new file mode 100644 index 0000000..aabdae8 --- /dev/null +++ b/tests/integration/internal/test_roundtrip_integration.py @@ -0,0 +1,182 @@ +"""Integration tests for round-tripping Python values through IndexOps/NodeOps.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from pathlib import Path +from uuid import uuid4 + +import pytest +from hypothesis import given, settings +from hypothesis import strategies as st + +from daggerml._internal._db import Ref +from daggerml._internal.ops.head import HeadOps +from daggerml._internal.ops.index import IndexOps +from daggerml._internal.ops.node import NodeOps +from daggerml._internal.types import DictDatum, ListDatum, Runnable, RunnableDatum, ScalarDatum, Uri + +pytestmark = pytest.mark.slow + +TEST_FN_ADAPTER = str(Path(__file__).resolve().parents[2] / "assets" / "internal_fn" / "python-fork-adapter.py") + + +def _remote_root_from_env() -> str: + return os.environ["DML_REMOTE_ROOT"] + + +@dataclass(frozen=True) 
+class RunnableSpec: + target: object # str | RunnableSpec + kwargs: dict[str, object] + adapter: str + + +def _scalar_strategy(): + return st.one_of( + st.none(), + st.booleans(), + st.integers(min_value=-(2**31), max_value=2**31 - 1), + st.floats(allow_nan=False, allow_infinity=False, width=32), + st.text(max_size=16), + ) + + +@st.composite +def _value_spec(draw, depth: int = 0): + if depth >= 3: + return draw(_scalar_strategy()) + kind = draw(st.sampled_from(["scalar", "list", "dict", "runnable"])) + if kind == "scalar": + return draw(_scalar_strategy()) + if kind == "list": + return draw(st.lists(_value_spec(depth=depth + 1), max_size=4)) + if kind == "dict": + return draw(st.dictionaries(st.text(min_size=1, max_size=8), _value_spec(depth=depth + 1), max_size=4)) + + # runnable + nested_target = False + target = ( + draw(_value_spec(depth=depth + 1).filter(lambda x: isinstance(x, RunnableSpec))) + if nested_target + else draw( + st.one_of( + st.sampled_from(["daggerml:list", "daggerml:dict", "daggerml:get", "file:///tmp/fn.py"]), + st.text(min_size=1, max_size=24), + ) + ) + ) + kwargs = draw(st.dictionaries(st.text(min_size=1, max_size=6), _value_spec(depth=depth + 1), max_size=3)) + return RunnableSpec( + target=target, + kwargs=kwargs, + adapter=draw(st.sampled_from(["", TEST_FN_ADAPTER, "custom-adapter"])), + ) + + +def _build_target_ref(spec_target: object, ops: IndexOps, index_ref: Ref) -> Ref: + if isinstance(spec_target, RunnableSpec): + nested = _materialize(spec_target, ops, index_ref) + nested_node = ops.put_literal(index_ref, nested) + with ops._tx(readonly=True) as txn: + return txn.get(nested_node).datum_ref(txn) + + uri_node = ops.put_literal(index_ref, Uri(str(spec_target))) + with ops._tx(readonly=True) as txn: + return txn.get(uri_node).datum_ref(txn) + + +def _materialize(value: object, ops: IndexOps, index_ref: Ref): + if isinstance(value, RunnableSpec): + target_ref = _build_target_ref(value.target, ops, index_ref) + kwargs: dict[str, Ref] 
= {} + for k, v in value.kwargs.items(): + vv = _materialize(v, ops, index_ref) + node_ref = ops.put_literal(index_ref, vv) + with ops._tx(readonly=True) as txn: + kwargs[k] = txn.get(node_ref).datum_ref(txn) + kwargs_node_ref = ops.put_literal(index_ref, kwargs) + with ops._tx(readonly=True) as txn: + kwargs_ref = txn.get(kwargs_node_ref).datum_ref(txn) + return RunnableDatum(target=target_ref, sub=None, kwargs=kwargs_ref, adapter=value.adapter) + if isinstance(value, list): + return [_materialize(v, ops, index_ref) for v in value] + if isinstance(value, dict): + return {k: _materialize(v, ops, index_ref) for k, v in value.items()} + return value + + +def _canonical_from_ref(txn, ref: Ref): + if ref.nss()[0] == "node": + node = txn.get(ref) + return _canonical_from_ref(txn, node.datum_ref(txn)) + + datum = txn.get(ref) + if isinstance(datum, ScalarDatum): + return datum.data + if isinstance(datum, ListDatum): + return [_canonical_from_ref(txn, x) for x in datum.data] + if isinstance(datum, DictDatum): + return {k: _canonical_from_ref(txn, v) for k, v in datum.data.items()} + if isinstance(datum, Uri): + return {"__uri__": datum.uri} + if isinstance(datum, RunnableDatum): + kwargs_datum: DictDatum = txn.get(datum.kwargs) + return { + "__runnable__": { + "adapter": datum.adapter, + "target": _canonical_from_ref(txn, datum.target), + "kwargs": {k: _canonical_from_ref(txn, v) for k, v in kwargs_datum.data.items()}, + } + } + raise AssertionError(f"Unsupported datum type: {type(datum).__name__}") + + +def _canonical_value(txn, value): + if isinstance(value, Ref): + return _canonical_from_ref(txn, value) + if isinstance(value, RunnableDatum): + kwargs_datum: DictDatum = txn.get(value.kwargs) + return { + "__runnable__": { + "adapter": value.adapter, + "target": _canonical_from_ref(txn, value.target), + "kwargs": {k: _canonical_from_ref(txn, v) for k, v in kwargs_datum.data.items()}, + } + } + if isinstance(value, Runnable): + return { + "__runnable__": { + "adapter": 
value.adapter, + "target": _canonical_value(txn, value.target), + "kwargs": {k: _canonical_value(txn, v) for k, v in value.kwargs.items()}, + } + } + if isinstance(value, Uri): + return {"__uri__": value.uri} + if isinstance(value, list): + return [_canonical_value(txn, x) for x in value] + if isinstance(value, dict): + return {k: _canonical_value(txn, v) for k, v in value.items()} + return value + + +@given(payload=_value_spec()) +@settings(max_examples=30, deadline=None) +def test_put_literal_unroll_roundtrip_with_nested_runnables(temp_bo, payload): + head_ops = HeadOps(_db=temp_bo._db) + index_ops = IndexOps(_db=temp_bo._db, remote_root=_remote_root_from_env()) + node_ops = NodeOps(_db=temp_bo._db) + + branch = head_ops.create_branch(f"rt-{uuid4().hex}") + index_ref = index_ops.create(head=branch) + try: + materialized = _materialize(payload, index_ops, index_ref) + root_ref = index_ops.put_literal(index_ref, materialized, name="root") + result = node_ops.unroll(root_ref) + with index_ops._tx(readonly=True) as txn: + assert _canonical_value(txn, result) == _canonical_value(txn, materialized) + finally: + index_ops.delete(index_ref) + head_ops.delete_branch(branch) diff --git a/tests/integration/runtime/test_exec_state_integration.py b/tests/integration/runtime/test_exec_state_integration.py new file mode 100644 index 0000000..beb0ce4 --- /dev/null +++ b/tests/integration/runtime/test_exec_state_integration.py @@ -0,0 +1,392 @@ +"""Tests for daggerml._internal.exec_state (S3-backed ExecutionState).""" + +from __future__ import annotations + +import os +import time +from unittest.mock import patch + +import boto3 +import pytest + +from daggerml._internal.exec_state import LOCK_TTL, AdapterIO, ExecutionState +from daggerml._internal.types import DmlRepoError + +pytestmark = pytest.mark.slow + +BUCKET = "test-exec-state-bucket" +REMOTE_ROOT = f"s3://{BUCKET}/test-prefix" + + +# --------------------------------------------------------------------------- +# 
Module-scoped moto S3 server (reuse pattern from tests/conftest.py) +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def _s3_server(): + with patch.dict(os.environ): + for key in list(os.environ.keys()): + if key.startswith("AWS_"): + del os.environ[key] + from moto.server import ThreadedMotoServer + + server = ThreadedMotoServer(port=0, verbose=False) + server.start() + host, port = server.get_host_and_port() + try: + yield { + "endpoint": f"http://{host}:{port}", + "envvars": { + "AWS_ACCESS_KEY_ID": "test", + "AWS_SECRET_ACCESS_KEY": "test", + "AWS_REGION": "us-east-1", + "AWS_DEFAULT_REGION": "us-east-1", + "AWS_ENDPOINT_URL": f"http://{host}:{port}", + }, + } + finally: + server.stop() + + +@pytest.fixture(autouse=True) +def s3_env(_s3_server): + """Set up S3 bucket for each test (clean state).""" + with patch.dict(os.environ, _s3_server["envvars"]): + boto3.setup_default_session() + s3 = boto3.client("s3", endpoint_url=_s3_server["endpoint"]) + try: + s3.create_bucket(Bucket=BUCKET) + except Exception: + # Bucket exists — delete all objects to start clean + resp = s3.list_objects_v2(Bucket=BUCKET) + for obj in resp.get("Contents", []): + s3.delete_object(Bucket=BUCKET, Key=obj["Key"]) + yield + + +def _es(cache_key: str) -> ExecutionState: + return ExecutionState(cache_key, remote_root=REMOTE_ROOT) + + +# --------------------------------------------------------------------------- +# 2.8 Constructor — missing / invalid remote_root raises DmlRepoError +# --------------------------------------------------------------------------- + + +class TestConstructor: + def test_missing_remote_root_raises(self): + with pytest.raises(DmlRepoError, match="s3://"): + ExecutionState("ck", remote_root="not-s3://foo") + + def test_empty_bucket_raises(self): + with pytest.raises(DmlRepoError): + ExecutionState("ck", remote_root="s3:///prefix") + + def test_non_s3_scheme_raises(self): + with 
pytest.raises(DmlRepoError): + ExecutionState("ck", remote_root="gs://bucket/prefix") + + def test_valid_construction(self): + es = _es("ck-valid") + assert es.cache_key == "ck-valid" + assert es._bucket == BUCKET + assert "dml/locks/ck-valid.json" in es._lock_key + + def test_key_derived_from_prefix(self): + es = ExecutionState("ck", remote_root="s3://mybucket/my/prefix") + assert es._lock_key == "my/prefix/dml/locks/ck.json" + + def test_key_no_prefix(self): + es = ExecutionState("ck", remote_root="s3://mybucket") + assert es._lock_key == "dml/locks/ck.json" + + +# --------------------------------------------------------------------------- +# 2.2 lock() creates file when absent, returns True +# --------------------------------------------------------------------------- + + +class TestLockAbsent: + def test_lock_creates_file_returns_true(self): + es = _es("lock-absent-1") + assert es.lock() is True + assert es._lock_token is not None + + def test_lock_file_exists_after_lock(self): + es = _es("lock-absent-2") + assert es.lock() is True + record = es._get_object() + assert record is not None + assert "lock_token" in record + assert "lock_expires_ts" in record + + +# --------------------------------------------------------------------------- +# 2.3 lock() returns False when non-expired lock exists +# --------------------------------------------------------------------------- + + +class TestLockHeld: + def test_lock_returns_false_when_held(self): + es1 = _es("lock-held-1") + es2 = _es("lock-held-1") + assert es1.lock() is True + assert es2.lock() is False + + def test_lock_token_unchanged_on_failure(self): + es1 = _es("lock-held-2") + es2 = _es("lock-held-2") + assert es1.lock() is True + assert es2.lock() is False + assert es2._lock_token is None + + +# --------------------------------------------------------------------------- +# 2.4 lock() steals expired lock (DELETE + re-PUT), returns True +# 
--------------------------------------------------------------------------- + + +class TestLockExpired: + def test_lock_steals_expired(self): + es1 = _es("lock-exp-1") + assert es1.lock(ttl=0.01) is True + time.sleep(0.05) + es2 = _es("lock-exp-1") + assert es2.lock() is True + assert es2._lock_token is not None + + def test_stolen_lock_has_new_token(self): + es1 = _es("lock-exp-2") + assert es1.lock(ttl=0.01) is True + old_record = es1._get_object() + time.sleep(0.05) + es2 = _es("lock-exp-2") + assert es2.lock() is True + new_record = es2._get_object() + assert old_record is not None and new_record is not None + assert new_record["lock_token"] != old_record["lock_token"] + + +# --------------------------------------------------------------------------- +# 2.5 lock() returns False on 412 concurrent conflict +# This is exercised indirectly via moto; we simulate by monkeypatching. +# --------------------------------------------------------------------------- + + +class TestLock412: + def test_lock_returns_false_on_412(self, monkeypatch): + """Simulate a 412 PreconditionFailed from S3.""" + import botocore.exceptions + + def _fake_put(*args, **kwargs): + error_response = {"Error": {"Code": "PreconditionFailed", "Message": "precondition failed"}} + raise botocore.exceptions.ClientError(error_response, "PutObject") + + es = _es("lock-412-1") + monkeypatch.setattr(es, "_put_object_if_absent", lambda _: False) + assert es.lock() is False + + +# --------------------------------------------------------------------------- +# 2.6 unlock() deletes the file +# --------------------------------------------------------------------------- + + +class TestUnlock: + def test_unlock_deletes_file(self): + es = _es("unlock-1") + assert es.lock() is True + es.unlock() + assert es._get_object() is None + + def test_unlock_clears_token(self): + es = _es("unlock-2") + assert es.lock() is True + es.unlock() + assert es._lock_token is None + + +# 
--------------------------------------------------------------------------- +# 2.7 unlock() is idempotent when file absent +# --------------------------------------------------------------------------- + + +class TestUnlockIdempotent: + def test_unlock_no_op_when_absent(self): + es = _es("unlock-idem-1") + # Never locked — should not raise + es.unlock() + + def test_double_unlock_no_error(self): + es = _es("unlock-idem-2") + assert es.lock() is True + es.unlock() + es.unlock() # second call is a no-op + + +# --------------------------------------------------------------------------- +# LOCK_TTL constant +# --------------------------------------------------------------------------- + + +def test_lock_ttl_is_positive(): + assert LOCK_TTL > 0 + + +def test_active_execution_pointer_round_trip(): + es = _es("active-1") + assert es.read_active_execution_id() is None + assert es.create_active_execution("exec-3") is True + assert es.read_active_execution_id() == "exec-3" + es.delete_active_execution() + assert es.read_active_execution_id() is None + + +def test_execution_record_is_create_only(): + es = _es("record-1") + record = { + "execution_id": "exec-1", + "cache_key": "record-1", + "lifecycle": "running", + "updated_at": 1, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + assert es.create_execution_record(record) is True + assert es.read_execution_record("exec-1") == record + assert es.create_execution_record(record) is False + assert es._key_for_execution("exec-1") == "test-prefix/dml/exec/state/exec-1.json" + + +def test_execution_record_updates_merge_monotonically(): + es = _es("record-2") + created = { + "execution_id": "exec-0", + "cache_key": "record-2", + "lifecycle": "running", + "updated_at": 10, + "spawned_execution_ids": [], + "cancellation_requested_by": None, + } + assert es.create_execution_record(created) + merged = es.update_execution_record( + { + "execution_id": "exec-0", + "cache_key": "record-2", + "lifecycle": 
"cancel-pending", + "updated_at": 11, + "spawned_execution_ids": ["exec-2"], + "cancellation_requested_by": "user@example.com", + } + ) + assert merged["spawned_execution_ids"] == ["exec-2"] + assert merged["lifecycle"] == "cancel-pending" + assert merged["cancellation_requested_by"] == "user@example.com" + + +def test_launch_state_round_trip(): + es = _es("launch-1") + launch_state = { + "execution_id": "exec-launch-1", + "cache_key": "launch-1", + "resume_state": {"pid": 1}, + "created_at": 1, + } + assert es.create_launch_state(launch_state) is True + assert es.read_launch_state("exec-launch-1") == launch_state + + +def test_call_edge_records_are_canonical_and_idempotent(): + es = _es("callee") + es.record_execution_dependency(caller_execution_id="caller-a", callee_execution_id="callee") + es.record_execution_dependency(caller_execution_id="caller-a", callee_execution_id="callee") + edge, _ = es._read_json(es._key_for_edge("callee", "caller-a")) + assert edge == {"caller_execution_id": "caller-a", "callee_execution_id": "callee"} + + +def test_invalidation_record_is_create_only(): + es = _es("invalidate") + assert es.create_invalidation_record( + execution_id="exec-9", + cache_key="invalidate", + requested_by="user@example.com", + requested_at=123, + ) + assert not es.create_invalidation_record( + execution_id="exec-9", + cache_key="invalidate", + requested_by="user@example.com", + requested_at=123, + ) + + +# --------------------------------------------------------------------------- +# AdapterIO +# --------------------------------------------------------------------------- + + +class TestAdapterIO: + def test_input_uri_derived_correctly(self): + es = _es("io-ck") + io = es.adapter_io("exec-uuid", "local:docker") + assert io.input_uri == f"s3://{BUCKET}/test-prefix/dml/io/io-ck/exec-uuid/local:docker/input.json" + + def test_output_uri_derived_correctly(self): + es = _es("io-ck") + io = es.adapter_io("exec-uuid", "local:docker") + assert io.output_uri == 
f"s3://{BUCKET}/test-prefix/dml/io/io-ck/exec-uuid/local:docker/output.json" + + def test_uri_properties_make_no_s3_call(self, monkeypatch): + calls = [] + es = _es("io-no-s3") + monkeypatch.setattr(es, "_put_object", lambda *a, **kw: calls.append(("put", a, kw))) + monkeypatch.setattr(es, "_get_object_bytes", lambda *a, **kw: calls.append(("get", a, kw)) or None) + io = es.adapter_io("exec-uuid", "local:docker") + _ = io.input_uri + _ = io.output_uri + assert calls == [] + + def test_write_input_stores_data_and_returns_input_uri(self): + es = _es("io-write") + io = es.adapter_io("exec-id-write", "lambda:batch") + uri = io.write_input(b'{"payload": 1}') + assert uri == io.input_uri + # Read back via raw S3 to confirm + result = es._get_object_bytes(io._input_key) + assert result is not None + assert result[0] == b'{"payload": 1}' + + def test_read_output_returns_none_when_absent(self): + es = _es("io-read-absent") + io = es.adapter_io("exec-id-absent", "lambda:batch") + assert io.read_output() is None + + def test_read_output_returns_bytes_when_present(self): + es = _es("io-read-present") + io = es.adapter_io("exec-id-present", "lambda:batch") + es._put_object(io._output_key, b'{"status":"succeeded"}') + assert io.read_output() == b'{"status":"succeeded"}' + + def test_adapter_io_factory_returns_adapter_io_instance(self): + es = _es("io-factory") + io = es.adapter_io("exec-x", "local:docker") + assert isinstance(io, AdapterIO) + + def test_paths_scoped_within_fn_exec_io(self): + es = _es("io-scope") + io = es.adapter_io("exec-y", "local:docker") + assert "dml/io/" in io.input_uri + assert "dml/io/" in io.output_uri + + def test_different_names_produce_different_paths(self): + es = _es("io-names") + io1 = es.adapter_io("exec-z", "local:docker") + io2 = es.adapter_io("exec-z", "lambda:batch") + assert io1.input_uri != io2.input_uri + assert io1.output_uri != io2.output_uri + + def test_no_prefix_remote_root(self): + es = ExecutionState("io-np", 
remote_root=f"s3://{BUCKET}") + io = es.adapter_io("exec-np", "local:docker") + assert io.input_uri == f"s3://{BUCKET}/dml/io/io-np/exec-np/local:docker/input.json" diff --git a/tests/test_core.py b/tests/test_core.py deleted file mode 100644 index 6ec6e6c..0000000 --- a/tests/test_core.py +++ /dev/null @@ -1,360 +0,0 @@ -import os -import re -from tempfile import TemporaryDirectory -from unittest import TestCase - -import pytest - -from daggerml.core import Dag, Dml, Error, Executable, Node, from_data - -SUM = Executable("./tests/assets/fns/sum.py", adapter="dml-python-fork-adapter") -ASYNC = Executable("./tests/assets/fns/async.py", adapter="dml-python-fork-adapter") -ENVVARS = Executable("./tests/assets/fns/envvars.py", adapter="dml-python-fork-adapter") -TIMEOUT = Executable("./tests/assets/fns/timeout.py", adapter="dml-python-fork-adapter") - - -class TestSetAttrs: - @pytest.mark.parametrize("x", [[0], (0,), [], ["asdf", None]]) # none contain 1 - def test_list_attrs(self, x, dml): - dag = dml.new("d0", "d0") - n0 = dag.put(x) - assert n0.contains(1).value() is False - assert 1 not in n0 - assert len(n0) == len(x) - for index, item_node in enumerate(n0): - item = x[index] - assert item_node.value() == item - assert n0.contains(item).value() is True - assert item in n0 - assert n0[index].value() == item - assert n0.append(1).value() == [*x, 1] - assert n0.conj(1).value() == [*x, 1] - - @pytest.mark.parametrize("x", [{}, {"a": 1}, {"x": 42, "y": {"k0": None}}]) # none contain 'z' - def test_dict_attrs(self, x, dml): - dag = dml.new("d0", "d0") - n0 = dag.put(x) - assert n0.contains("z").value() is False - assert "z" not in n0 - assert len(n0) == len(x) - assert n0.get("z", default=123).value() == 123 - for key in n0: - item = x[key] - assert n0[key].value() == item - assert n0.contains(key).value() is True - assert key in n0 - assert n0.get(key).value() == item - assert [(k, v.value()) for k, v in n0.items()] == list(x.items()) - assert n0.keys() == 
list(x.keys()) - assert [x.value() for x in n0.values()] == list(x.values()) - assert n0.assoc("y", 3).value() == {**x, "y": 3} - assert n0.update({"z": 1, "a": 2}).value() == {**x, "z": 1, "a": 2} - - def test_load_reboot(self, dml): - with dml.new("d0", "d0") as dag: - dag.put(42, name="n0") - dag.commit("foo") - with dml.new("d1", "d1") as dag: - node = dag.load("d0", name="n1") - assert node.dag == dag - assert node.value() == "foo" - assert node.load().n0.value() == 42 - assert dag.load("d0", key="n0").value() == 42 - - def test_node_call_w_literal_deps(self, dml): - nums = [1, 2, 3] - dag = dml.new("d0", "d0") - fn = Executable( - "./tests/assets/fns/sum.py", - adapter="dml-python-fork-adapter", - prepop={"x": 10}, - ) - result = dag.call(fn, *nums) - assert result.value() == sum(nums) - assert "x" in result.load().keys() - assert result.load().x.value() == 10 - - def test_node_call_w_node_deps(self, dml): - nums = [1, 2, 3] - dag = dml.new("d0", "d0") - fn = Executable( - "./tests/assets/fns/sum.py", - adapter="dml-python-fork-adapter", - prepop={"x": dag.put(10)}, - ) - result = dag.call(fn, *nums) - assert result.value() == sum(nums) - assert "x" in result.load().keys() - assert result.load().x.value() == 10 - - def test_node_call_w_kwarg(self, dml): - nums = [1, 2, 3] - dag = dml.new("d0", "d0") - fn = Executable( - "./tests/assets/fns/sum.py", - adapter="dml-python-fork-adapter", - prepop={"x": 10}, - ) - result = dag.call(fn, *nums, x=100) - assert result.value() == sum(nums) - assert "x" in result.load().keys() - assert result.load().x.value() == 100 - - def test_bad_kwarg(self, dml): - nums = [1, 2, 3] - dag = dml.new("d0", "d0") - fn = Executable( - "./tests/assets/fns/sum.py", - adapter="dml-python-fork-adapter", - prepop={"x": 10}, - ) - msg = re.escape(r"Function called with extraneous kwargs (not in `fn.prepop`): ['y']") - with pytest.raises(Error, match=msg): - dag.call(fn, *nums, y=100) - - def test_node_call(self, dml): - nums = [1, 2, 3] - 
dag = dml.new("d0", "d0") - fn = dag.put(SUM) - result = fn(*nums) - assert result.value() == sum(nums) - - def test_load_recursing(self, dml): - nums = [1, 2, 3] - with dml.new("d0", "d0") as dag: - dag.commit(dag.call(SUM, *nums, name="n1")) - d1 = dml.new("d1", "d1") - n1 = d1.put(dml.load("d0").n1, name="n1_1") - assert n1.dag == d1 - n2 = n1.load().n1.load().num_args - assert n2.value() == len(nums) - assert n1.value() == sum(nums) - - def test_caching(self): - nums = [1, 2, 3] - with TemporaryDirectory(prefix="dml-cache-") as cache_path: - with Dml.temporary(cache_path=cache_path) as dml: - config_dir = dml.config_dir - with dml.new("d0", "d0") as d1: - n1 = d1.call(SUM, *nums) - assert n1.value() == sum(nums) - assert isinstance(n1.load(), Dag) - uid = n1.load().uuid.value() - with Dml.temporary(cache_path=cache_path) as dml: - assert dml.config_dir != config_dir, "Config dir should not be the same" - with dml.new("d1", "d0") as d1: - n1 = d1.call(SUM, *nums) - uid1 = n1.load().uuid.value() - assert uid == uid1, "Cached dag should have the same UUID" - - def test_no_caching(self): - nums = [1, 2, 3] - with TemporaryDirectory(prefix="dml-cache-") as cache_path: - with Dml.temporary(cache_path=cache_path) as dml: - config_dir = dml.config_dir - with dml.new("d0", "d0") as d1: - n1 = d1.call(SUM, *nums) - uid = n1.load().uuid.value() - with TemporaryDirectory(prefix="dml-cache-") as cache_path: - with Dml.temporary(cache_path=cache_path) as dml: - assert dml.config_dir != config_dir, "Config dir should not be the same" - with dml.new("d1", "d0") as d1: - n1 = d1.call(SUM, *nums) - uid1 = n1.load().uuid.value() - assert uid != uid1, "Cached dag should have the same UUID" - - def test_nodemap(self, dml): - dag = dml.new("d0", "d0") - dag.a = 23 - node = dag.put(42, name="b") - other = dag.put(420) - assert dag.a.value() == 23 - assert list(dag) == ["a", "b"] - dag.commit([node, other]) - - def test_set_attrs(self, dml): - dag = dml.new("d0", "d0") - n0 = 
dag.put({0}) - assert n0.contains(1).value() is False - assert n0.contains(0).value() is True - assert 0 in n0 - n1 = n0.append(1) - assert n1.value() == {0, 1} - - def test_load_constructors(self, dml): - dag = dml.new("d0", "d0") - l0 = dag.put(42) - c0 = dag.put({"a": 1, "b": [l0, "23"]}) - assert c0.backtrack("b", 0) == l0 - assert c0.backtrack("b", 1).value() == "23" - assert c0.backtrack("b").backtrack(0) == l0 - assert c0["b"][0] != l0 - c1 = c0["b"] - assert c1.backtrack() == c0 - assert c1.backtrack().backtrack("b", 0) == l0 - - def test_fn_ok_cache(self, dml): - with dml.new("d0", "d0") as dag: - nodes = [dag.call(SUM, i, 1, 2) for i in range(2)] # unique function applications - dag.call(SUM, 0, 1, 2) # add a repeat outside so `nodes` is still unique - dag.commit(nodes[0]) - assert dag.result.value() == 3 - cache_list = dml("cache", "list", as_text=True) # response is jsonlines format - assert len([x for x in cache_list if x.rstrip() == "{"]) == 2 # this gets us unique maps - - def test_async_fn_ok(self, dml): - debug_file = os.path.join(dml.config_dir, "debug") - with dml.new("d0", "d0") as dag: - n1 = dag.call(ASYNC, 1, 2, 3) - dag.commit(n1) - assert n1.value() == 6 - with open(debug_file, "r") as f: - assert len([1 for _ in f]) == 2 - - def test_async_fn_error(self, dml): - with pytest.raises(Error, match=r".*unsupported operand type.*"): - with dml.new("d0", "d0") as dag: - dag.call(ASYNC, 1, 2, "asdf") - info = [x for x in dml("dag", "list") if x["name"] == "d0"] - assert len(info) == 1 - - def test_async_fn_timeout(self, dml): - with pytest.raises(TimeoutError): - with dml.new("d0", "d0") as dag: - dag.call(TIMEOUT, 1, 2, 3, timeout=1000) - - def test_load(self, dml): - with dml.new("d0", "d0") as dag: - dag.put(42, name="n0") - dag.commit("foo") - dl = dml.load("d0") - assert isinstance(dl, Dag) - assert dl.n0.value() == 42 - assert dl.result.value() == "foo" - - def test_doc(self, dml): - dag = dml.new("d0", "d0") - n = dag.put(42, name="n0", 
doc="The answer to life, the universe, and everything") - assert n.__doc__ == "The answer to life, the universe, and everything" - - -class TestBasic(TestCase): - def test_init(self): - with Dml.temporary() as dml: - status = dml("status") - self.assertDictEqual( - {k: v for k, v in status.items() if k != "cache_path"}, - { - "repo": dml.kwargs.get("repo"), - "branch": dml.kwargs.get("branch"), - "user": dml.kwargs.get("user"), - "config_dir": dml.kwargs.get("config_dir"), - "project_dir": dml.kwargs.get("project_dir"), - }, - ) - assert status["cache_path"].startswith(os.path.expanduser("~")) - self.assertEqual(dml.envvars["DML_CONFIG_DIR"], dml.kwargs.get("config_dir")) - self.assertEqual( - {k: v for k, v in dml.envvars.items() if k != "DML_CACHE_PATH"}, - { - "DML_REPO": dml.kwargs.get("repo"), - "DML_BRANCH": dml.kwargs.get("branch"), - "DML_USER": dml.kwargs.get("user"), - "DML_CONFIG_DIR": dml.kwargs.get("config_dir"), - "DML_PROJECT_DIR": dml.kwargs.get("project_dir"), - }, - ) - - def test_init_kwargs(self): - with TemporaryDirectory(prefix="dml-cache-") as cache_path: - with Dml.temporary(repo="does-not-exist", branch="unique-name", cache_path=cache_path) as dml: - self.assertDictEqual( - dml("status"), - { - "repo": "does-not-exist", - "branch": "unique-name", - "user": dml.kwargs.get("user"), - "config_dir": dml.kwargs.get("config_dir"), - "project_dir": dml.kwargs.get("project_dir"), - "cache_path": dml.kwargs.get("cache_path"), - }, - ) - self.assertEqual(dml.envvars["DML_CONFIG_DIR"], dml.kwargs.get("config_dir")) - self.assertEqual( - dml.envvars, - { - "DML_REPO": "does-not-exist", - "DML_BRANCH": "unique-name", - "DML_USER": dml.kwargs.get("user"), - "DML_CONFIG_DIR": dml.kwargs.get("config_dir"), - "DML_PROJECT_DIR": dml.kwargs.get("project_dir"), - "DML_CACHE_PATH": cache_path, - }, - ) - - def test_message_handler_load(self): - local_value = None - - def message_handler(dump): - nonlocal local_value - local_value = dump - - with 
TemporaryDirectory(prefix="dml-cache-") as cache_path: - with Dml.temporary(cache_path=cache_path) as dml: - d0 = dml.new("d0", "d0", message_handler=message_handler) - data = {"key": "value", "list": [1, 2, 3], "dict": {"a": 1, "b": 2}, "resource": SUM} - n0 = d0.put(data, name="n0") - d0.commit(n0) - assert isinstance(local_value, str) - with TemporaryDirectory(prefix="dml-cache-") as cache_path: - with Dml.temporary(cache_path=cache_path) as dml: - ref = from_data(dml("ref", "load", local_value)) - assert len(dml("dag", "describe", ref.to)["nodes"]) == 1 - - def test_dag(self): - local_value = None - - def message_handler(dump): - nonlocal local_value - local_value = dump - - with TemporaryDirectory(prefix="dml-cache-") as cache_path: - with Dml.temporary(cache_path=cache_path) as dml: - d0 = dml.new("d0", "d0", message_handler=message_handler) - self.assertIsInstance(d0, Dag) - n0 = d0.put([42], name="n0") - assert isinstance(n0, Node) - self.assertIsInstance(n0, Node) - self.assertEqual(n0.value(), [42]) - assert len(d0) == 1 - self.assertEqual(len(n0), 1) - self.assertEqual(n0.type, "list") - d0["x0"] = n0 - self.assertEqual(d0["x0"], n0) - self.assertEqual(d0.x0, n0) - d0.x1 = 42 - self.assertEqual(d0["x1"].value(), 42) - self.assertEqual(d0.x1.value(), 42) - d0.n1 = n0[0] - self.assertIsInstance(n0[0], Node) - self.assertEqual([x.value() for x in n0], [d0.n1.value()]) - self.assertEqual(d0.n1.value(), 42) - d0.n2 = {"x": n0, "y": "z"} - self.assertNotEqual(d0.n2["x"], n0) - self.assertEqual(d0.n2["x"].value(), n0.value()) - d0.n3 = list(d0.n2.items()) - self.assertIsInstance([x for x in d0.n3], list) - self.assertDictEqual( - {k: v.value() for k, v in d0.n2.items()}, - {"x": n0.value(), "y": "z"}, - ) - d0.n4 = [1, 2, 3, 4, 5] - d0.n5 = d0.n4[1:] - self.assertListEqual([x.value() for x in d0.n5], [2, 3, 4, 5]) - d0.commit(n0) - self.assertIsInstance(local_value, str) - dag = dml("dag", "list")[0] - self.assertEqual(dag["result"], n0.ref.to) - assert 
len(dml("dag", "list", "--all")) > 1 - dml("dag", "delete", dag["name"], "Deleting dag") - dml("repo", "gc", as_text=True)