Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
44ac109
Add adapter-level retry for transient HTTP errors in direct engine
denik May 26, 2026
c955b5c
update tests
denik May 27, 2026
a176d26
tests
denik May 27, 2026
4967558
Scope retry to 504+!IsRetriable; add opt-in retrySafe for idempotent …
denik May 27, 2026
4edfda4
clean up
denik May 27, 2026
d5e1599
rename doRefresh to doRead
denik May 27, 2026
f51f6ba
Move retry logic from adapter to call sites in apply.go and bundle_pl…
denik May 27, 2026
05f5bd7
Log and fall through on transient errors in WaitAfterCreate/WaitAfter…
denik May 27, 2026
82e3e2c
Rename retryErr to retryOnTransientErr
denik May 27, 2026
73d0b05
Fix log prefix in Destroy: "destroying" not "deploying"
denik May 27, 2026
c68f966
Reduce default retry interval from 30s to 15s
denik May 27, 2026
1720072
Use errors.AsType[T] instead of errors.As for Go 1.26+ compatibility
denik May 28, 2026
dd2b0e4
Revert unrelated cosmetic changes in adapter.go
denik May 28, 2026
fe5f124
Inline IsRetrySafe: export RetrySafeError, use errors.AsType at call …
denik May 28, 2026
5219571
Retry WaitAfterCreate/WaitAfterUpdate on transient errors instead of …
denik May 28, 2026
564f68e
Guard apiErr nil in retryWith log line
denik May 28, 2026
0059f64
Export FaultRules/FaultRule and add unit tests for fault injection logic
denik May 28, 2026
95d6a8a
restructure
denik May 28, 2026
bf19721
Mark acceptance/bin/fault.py as executable
denik May 28, 2026
80a9f58
Fix uniq -c count width portability in 504/plan acceptance test
denik May 28, 2026
e7b037f
Add NEXT_CHANGELOG entry for 504 retry
denik May 28, 2026
42f0cc1
add a comment about DoCreate opt-in
denik May 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEXT_CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
* `experimental open` now opens every DABs resource type that has a workspace URL, picking up `catalogs`, `schemas`, `volumes`, `database_instances`, `database_catalogs`, `synced_database_tables`, `postgres_catalogs`, `postgres_synced_tables`, `quality_monitors`, `vector_search_endpoints`, and `vector_search_indexes` ([#5346](https://github.com/databricks/cli/pull/5346)).

### Bundles
* Retry transient HTTP 504 Gateway Timeout errors in the direct deployment engine ([#5349](https://github.com/databricks/cli/pull/5349)).

### Dependency updates
51 changes: 51 additions & 0 deletions acceptance/bin/fault.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""Set up a fault rule on the testserver for the current test token.

Usage: fault.py PATTERN STATUS_CODE OFFSET TIMES

PATTERN      HTTP method and path, supports trailing * wildcard,
             e.g. "PUT /api/2.0/permissions/pipelines/*"
STATUS_CODE  HTTP status code to return, e.g. 504
OFFSET       number of requests to let through before fault starts
TIMES        number of times to return the fault response

The rule is scoped to the current DATABRICKS_TOKEN so it only affects
the test that registers it, even when tests share a server.
"""

import json
import os
import sys
import urllib.request

# Response body the testserver is asked to return for faulted requests.
FAULT_BODY = '{"error_code":"INJECTED","message":"Fault injected by test."}'

# Upper bound on the registration call so a wedged testserver fails the test
# instead of hanging it forever.
REQUEST_TIMEOUT_SECONDS = 30


def build_request(host, token, pattern, status_code, offset, times):
    """Build the POST request that registers a fault rule on the testserver.

    Args:
        host: base URL of the testserver; a trailing slash is tolerated.
        token: bearer token sent in the Authorization header.
        pattern: HTTP method and path to match, e.g. "GET /api/2.0/foo/*".
        status_code: HTTP status code the fault should return.
        offset: number of matching requests to let through first.
        times: number of times the fault response is returned.

    Returns:
        A urllib.request.Request ready to be passed to urlopen().
    """
    payload = json.dumps(
        {
            "pattern": pattern,
            "status_code": status_code,
            "body": FAULT_BODY,
            "offset": offset,
            "times": times,
        }
    ).encode()

    # Strip trailing slashes so the path never starts with "//".
    url = f"{host.rstrip('/')}/__testserver/fault"

    return urllib.request.Request(
        url,
        data=payload,
        headers={"Content-Type": "application/json", "Authorization": f"Bearer {token}"},
        method="POST",
    )


def main(argv):
    """Parse argv, register the fault rule, and return the process exit code.

    Returns 1 (after printing to stderr) when DATABRICKS_HOST is unset, the
    argument count is wrong, or a numeric argument is not an integer. HTTP
    errors from the testserver propagate as exceptions so the test fails loudly.
    """
    host = os.environ.get("DATABRICKS_HOST", "")
    token = os.environ.get("DATABRICKS_TOKEN", "")

    if not host:
        print("DATABRICKS_HOST not set", file=sys.stderr)
        return 1

    if len(argv) != 5:
        print(f"usage: {argv[0]} PATTERN STATUS_CODE OFFSET TIMES", file=sys.stderr)
        return 1

    pattern = argv[1]
    try:
        status_code, offset, times = int(argv[2]), int(argv[3]), int(argv[4])
    except ValueError as exc:
        print(f"STATUS_CODE, OFFSET and TIMES must be integers: {exc}", file=sys.stderr)
        print(f"usage: {argv[0]} PATTERN STATUS_CODE OFFSET TIMES", file=sys.stderr)
        return 1

    req = build_request(host, token, pattern, status_code, offset, times)

    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT_SECONDS):
        pass
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
bundle:
name: test-bundle

resources:
pipelines:
foo:
name: foo
permissions:
- level: CAN_VIEW
user_name: viewer@example.com

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files...
Deploying resources...
Warn: deploying resources.pipelines.foo.permissions: retrying after 504 Gateway Timeout from PUT /api/2.0/permissions/pipelines/[UUID]
Updating deployment state...
Deployment complete!

>>> print_requests.py //api/2.0/permissions/pipelines
"PUT /api/2.0/permissions/pipelines/[UUID]"
"PUT /api/2.0/permissions/pipelines/[UUID]"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Inject a single 504 on the first permissions PUT to simulate a transient error.
# Permissions Set is idempotent, so DoCreate opts in via retrySafe and the deploy succeeds.
# Arguments after the pattern are STATUS_CODE OFFSET TIMES: fail with 504, starting at the
# first matching request (offset 0), exactly once.
fault.py "PUT /api/2.0/permissions/pipelines/*" 504 0 1

$CLI bundle deploy

# Two PUT requests should appear: the initial 504 and the successful retry.
trace print_requests.py //api/2.0/permissions/pipelines | jq '.method + " " + .path'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
RecordRequests = true
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
bundle:
name: test-bundle

resources:
pipelines:
foo:
name: foo
permissions:
- level: CAN_VIEW
user_name: viewer@example.com

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

>>> [CLI] bundle plan
Warn: planning resources.pipelines.foo.permissions: retrying after 504 Gateway Timeout from GET /api/2.0/permissions/pipelines/[UUID]
Plan: 0 to add, 0 to change, 0 to delete, 2 unchanged

>>> print_requests.py //api/2.0/permissions/pipelines --get --oneline
2 {"method": "GET", "path": "/api/2.0/permissions/pipelines/[UUID]"}

>>> [CLI] bundle plan
Warn: planning resources.pipelines.foo.permissions: retrying after 504 Gateway Timeout from GET /api/2.0/permissions/pipelines/[UUID]
Warn: planning resources.pipelines.foo.permissions: retrying after 504 Gateway Timeout from GET /api/2.0/permissions/pipelines/[UUID]
Plan: 0 to add, 0 to change, 0 to delete, 2 unchanged
3 {"method": "GET", "path": "/api/2.0/permissions/pipelines/[UUID]"}

>>> musterr [CLI] bundle plan
Warn: planning resources.pipelines.foo.permissions: retrying after 504 Gateway Timeout from GET /api/2.0/permissions/pipelines/[UUID]
Warn: planning resources.pipelines.foo.permissions: retrying after 504 Gateway Timeout from GET /api/2.0/permissions/pipelines/[UUID]
Error: cannot plan resources.pipelines.foo.permissions: reading id="/pipelines/[UUID]": Fault injected by test. (504 INJECTED)

Endpoint: GET [DATABRICKS_URL]/api/2.0/permissions/pipelines/[UUID]?
HTTP Status: 504 Gateway Timeout
API error_code: INJECTED
API message: Fault injected by test.

Error: planning failed

3 {"method": "GET", "path": "/api/2.0/permissions/pipelines/[UUID]"}
15 changes: 15 additions & 0 deletions acceptance/bundle/resources/permissions/pipelines/504/plan/script
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Deploy first so the permissions resource exists; plan reads it on subsequent runs.
$CLI bundle deploy
# Discard requests recorded during the deploy so only the plan's GETs are counted below.
rm -f out.requests.txt

# One injected 504: plan warns once and retries, so two GETs are expected.
# sed strips the leading padding of the `uniq -c` count, whose width is not portable.
fault.py "GET /api/2.0/permissions/pipelines/*" 504 0 1
trace $CLI bundle plan
trace print_requests.py //api/2.0/permissions/pipelines --get --oneline | uniq -c | sed 's/^ *//' | contains.py "2 "

# Two injected 504s: plan warns twice and the third GET succeeds.
fault.py "GET /api/2.0/permissions/pipelines/*" 504 0 2
trace $CLI bundle plan
print_requests.py //api/2.0/permissions/pipelines --get --oneline | uniq -c | sed 's/^ *//' | contains.py "3 "

# Three injected 504s: every attempt fails, so plan must error out after three GETs.
fault.py "GET /api/2.0/permissions/pipelines/*" 504 0 3
trace musterr $CLI bundle plan
print_requests.py //api/2.0/permissions/pipelines --get --oneline | uniq -c | sed 's/^ *//' | contains.py "3 "
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"]
EnvMatrix.READPLAN = []
RecordRequests = true
Env.DATABRICKS_BUNDLE_RETRY_INTERVAL_MS = "100"
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
bundle:
name: test-bundle

resources:
pipelines:
foo:
name: foo
permissions:
- level: CAN_VIEW
user_name: viewer@example.com

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

>>> [CLI] bundle deploy
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

>>> [CLI] bundle deploy
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files...
Deploying resources...
Warn: deploying resources.pipelines.foo.permissions: retrying after 504 Gateway Timeout from PUT /api/2.0/permissions/pipelines/[UUID]
Updating deployment state...
Deployment complete!

>>> print_requests.py //api/2.0/permissions/pipelines
"PUT /api/2.0/permissions/pipelines/[UUID]"
"PUT /api/2.0/permissions/pipelines/[UUID]"
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Initial deploy creates the pipeline and its permissions.
trace $CLI bundle deploy

# Change the permission level so the next deploy issues a permissions update.
update_file.py databricks.yml CAN_VIEW CAN_MANAGE

# Inject a single 504 on the first permissions PUT to simulate a transient error.
# The update is retried on transient errors after DATABRICKS_BUNDLE_RETRY_INTERVAL_MS,
# so the deploy should succeed.
fault.py "PUT /api/2.0/permissions/pipelines/*" 504 0 1

# Drop requests recorded by the first deploy so only the second deploy's PUTs are printed.
rm out.requests.txt
trace $CLI bundle deploy

# Two PUT requests should appear: the initial 504 and the successful retry.
trace print_requests.py //api/2.0/permissions/pipelines | jq '.method + " " + .path'
48 changes: 39 additions & 9 deletions bundle/direct/apply.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@ import (
"reflect"

"github.com/databricks/cli/bundle/deployplan"
"github.com/databricks/cli/bundle/direct/dresources"
"github.com/databricks/cli/bundle/direct/dstate"
"github.com/databricks/cli/libs/log"
"github.com/databricks/databricks-sdk-go/apierr"
)

func (d *DeploymentUnit) Destroy(ctx context.Context, db *dstate.DeploymentState) error {
ctx = log.WithPrefix(ctx, "destroying "+d.ResourceKey)
id := db.GetResourceID(d.ResourceKey)
if id == "" {
log.Infof(ctx, "Cannot delete %s: missing from state", d.ResourceKey)
Expand All @@ -24,6 +26,7 @@ func (d *DeploymentUnit) Destroy(ctx context.Context, db *dstate.DeploymentState
}

func (d *DeploymentUnit) Deploy(ctx context.Context, db *dstate.DeploymentState, newState any, actionType deployplan.ActionType, planEntry *deployplan.PlanEntry) error {
ctx = log.WithPrefix(ctx, "deploying "+d.ResourceKey)
if actionType == deployplan.Create {
return d.Create(ctx, db, newState)
}
Expand All @@ -48,7 +51,18 @@ func (d *DeploymentUnit) Deploy(ctx context.Context, db *dstate.DeploymentState,
}

func (d *DeploymentUnit) Create(ctx context.Context, db *dstate.DeploymentState, newState any) error {
newID, remoteState, err := d.Adapter.DoCreate(ctx, newState)
var newID string
var remoteState any
_, err := retryWith(ctx, func(err error) bool {
// For DoCreate, retry feature is opt-in via retrySafe(err) error wrapper
_, ok := errors.AsType[*dresources.RetrySafeError](err)
Comment thread
denik marked this conversation as resolved.
Comment thread
denik marked this conversation as resolved.
return ok && isTransient(ctx, err)
}, func() (struct{}, error) {
var e error
newID, remoteState, e = d.Adapter.DoCreate(ctx, newState)
return struct{}{}, e
})
err = dresources.UnwrapRetrySafe(err)
if err != nil {
// No need to prefix error, there is no ambiguity (only one operation - DoCreate) and no additional context (like id)
return err
Expand All @@ -66,7 +80,9 @@ func (d *DeploymentUnit) Create(ctx context.Context, db *dstate.DeploymentState,
return fmt.Errorf("saving state after creating id=%s: %w", newID, err)
}

waitRemoteState, err := d.Adapter.WaitAfterCreate(ctx, newID, newState)
waitRemoteState, err := retryOnTransient(ctx, func() (any, error) {
return d.Adapter.WaitAfterCreate(ctx, newID, newState)
})
if err != nil {
return fmt.Errorf("waiting after creating id=%s: %w", newID, err)
}
Expand All @@ -89,7 +105,7 @@ func (d *DeploymentUnit) Recreate(ctx context.Context, db *dstate.DeploymentStat
// MANAGED_BY_PARENT is still disregarded — the subsequent Create with
// replace_existing=true will reconfigure the parent-managed resource in
// place, matching the Terraform provider's recreate behaviour.
err = d.Adapter.DoDelete(ctx, oldID, oldState)
err = retryOnTransientErr(ctx, func() error { return d.Adapter.DoDelete(ctx, oldID, oldState) })
if err != nil && !isResourceGone(err) && !isManagedByParent(err) {
return fmt.Errorf("deleting old id=%s: %w", oldID, err)
}
Expand Down Expand Up @@ -118,7 +134,9 @@ func (d *DeploymentUnit) Update(ctx context.Context, db *dstate.DeploymentState,
return fmt.Errorf("internal error: DoUpdate not implemented for resource %s", d.ResourceKey)
}

remoteState, err := d.Adapter.DoUpdate(ctx, id, newState, planEntry)
remoteState, err := retryOnTransient(ctx, func() (any, error) {
return d.Adapter.DoUpdate(ctx, id, newState, planEntry)
})
if err != nil {
return fmt.Errorf("updating id=%s: %w", id, err)
}
Expand All @@ -133,7 +151,9 @@ func (d *DeploymentUnit) Update(ctx context.Context, db *dstate.DeploymentState,
return fmt.Errorf("saving state id=%s: %w", id, err)
}

waitRemoteState, err := d.Adapter.WaitAfterUpdate(ctx, id, newState)
waitRemoteState, err := retryOnTransient(ctx, func() (any, error) {
return d.Adapter.WaitAfterUpdate(ctx, id, newState)
})
if err != nil {
return fmt.Errorf("waiting after updating id=%s: %w", id, err)
}
Expand All @@ -148,7 +168,13 @@ func (d *DeploymentUnit) Update(ctx context.Context, db *dstate.DeploymentState,
}

func (d *DeploymentUnit) UpdateWithID(ctx context.Context, db *dstate.DeploymentState, oldID string, newState any) error {
newID, remoteState, err := d.Adapter.DoUpdateWithID(ctx, oldID, newState)
var newID string
var remoteState any
err := retryOnTransientErr(ctx, func() error {
var e error
newID, remoteState, e = d.Adapter.DoUpdateWithID(ctx, oldID, newState)
return e
})
if err != nil {
return fmt.Errorf("updating id=%s: %w", oldID, err)
}
Expand All @@ -169,7 +195,9 @@ func (d *DeploymentUnit) UpdateWithID(ctx context.Context, db *dstate.Deployment
return fmt.Errorf("saving state id=%s: %w", oldID, err)
}

waitRemoteState, err := d.Adapter.WaitAfterUpdate(ctx, newID, newState)
waitRemoteState, err := retryOnTransient(ctx, func() (any, error) {
return d.Adapter.WaitAfterUpdate(ctx, newID, newState)
})
if err != nil {
return fmt.Errorf("waiting after updating id=%s: %w", newID, err)
}
Expand Down Expand Up @@ -219,7 +247,7 @@ func (d *DeploymentUnit) Delete(ctx context.Context, db *dstate.DeploymentState,
}

func (d *DeploymentUnit) Resize(ctx context.Context, db *dstate.DeploymentState, id string, newState any) error {
err := d.Adapter.DoResize(ctx, id, newState)
err := retryOnTransientErr(ctx, func() error { return d.Adapter.DoResize(ctx, id, newState) })
if err != nil {
return fmt.Errorf("resizing id=%s: %w", id, err)
}
Expand Down Expand Up @@ -263,7 +291,9 @@ func (d *DeploymentUnit) refreshRemoteState(ctx context.Context, id string) erro
if d.RemoteState != nil {
return nil
}
remoteState, err := d.Adapter.DoRead(ctx, id)
remoteState, err := retryOnTransient(ctx, func() (any, error) {
return d.Adapter.DoRead(ctx, id)
})
if err != nil {
return fmt.Errorf("failed to refresh remote state id=%s: %w", id, err)
}
Expand Down
9 changes: 7 additions & 2 deletions bundle/direct/bundle_plan.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks
// We're processing resources in DAG order because we're resolving references (that can be resolved at plan stage).
g.Run(defaultParallelism, func(resourceKey string, failedDependency *string) bool {
errorPrefix := "cannot plan " + resourceKey
ctx := log.WithPrefix(ctx, "planning "+resourceKey)

entry, err := plan.WriteLockEntry(resourceKey)
if err != nil {
Expand Down Expand Up @@ -155,7 +156,9 @@ func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks
return false
}

remoteState, err := adapter.DoRead(ctx, id)
remoteState, err := retryOnTransient(ctx, func() (any, error) {
return adapter.DoRead(ctx, id)
})
if err != nil {
if isResourceGone(err) {
// no such resource
Expand Down Expand Up @@ -210,7 +213,9 @@ func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks
return false
}

remoteState, err := adapter.DoRead(ctx, dbentry.ID)
remoteState, err := retryOnTransient(ctx, func() (any, error) {
return adapter.DoRead(ctx, dbentry.ID)
})
if err != nil {
if isResourceGone(err) {
remoteState = nil
Expand Down
3 changes: 2 additions & 1 deletion bundle/direct/dresources/grants.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ func (r *ResourceGrants) DoRead(ctx context.Context, id string) (*GrantsState, e
func (r *ResourceGrants) DoCreate(ctx context.Context, state *GrantsState) (string, *GrantsState, error) {
_, err := r.DoUpdate(ctx, "", state, nil)
if err != nil {
return "", nil, err
// Grants Update is idempotent (additive PATCH), so retrying on transient errors is safe.
return "", nil, retrySafe(err)
}

return state.SecurableType + "/" + state.FullName, nil, nil
Expand Down
3 changes: 2 additions & 1 deletion bundle/direct/dresources/permissions.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,8 @@ func (r *ResourcePermissions) DoCreate(ctx context.Context, newState *Permission
// should we remember the default here?
_, err := r.DoUpdate(ctx, newState.ObjectID, newState, nil)
if err != nil {
return "", nil, err
// Permissions Set is idempotent (PUT), so retrying on transient errors is safe.
return "", nil, retrySafe(err)
}

return newState.ObjectID, nil, nil
Expand Down
Loading
Loading