diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 257624d3a0..cc3cc33db5 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -5,6 +5,7 @@ ### Notable Changes ### CLI +* Introduce retries to `databricks psql` command ([#3492](https://github.com/databricks/cli/pull/3492)) * Add rule files for coding agents working on the CLI code base ([#3245](https://github.com/databricks/cli/pull/3245)) ### Dependency updates diff --git a/acceptance/cmd/psql/failing-connection/always-fail.sh b/acceptance/cmd/psql/failing-connection/always-fail.sh new file mode 100755 index 0000000000..a7596688f6 --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/always-fail.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# +# This script prints its arguments and exits. +# The test script renames this script to "psql" in order to capture the arguments that the CLI passes to psql command. +# +echo "Simulating connection failure with exit code '2'" +exit 2 diff --git a/acceptance/cmd/psql/failing-connection/out.test.toml b/acceptance/cmd/psql/failing-connection/out.test.toml new file mode 100644 index 0000000000..04844bff9e --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/out.test.toml @@ -0,0 +1,8 @@ +Local = true +Cloud = false + +[GOOS] + windows = false + +[EnvMatrix] + DATABRICKS_CLI_DEPLOYMENT = ["terraform", "direct-exp"] diff --git a/acceptance/cmd/psql/failing-connection/output.txt b/acceptance/cmd/psql/failing-connection/output.txt new file mode 100644 index 0000000000..5cd6350d78 --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/output.txt @@ -0,0 +1,61 @@ + +=== Command should use default number of retries: +>>> musterr [CLI] psql my-database -- --dbname=db1 -p 3000 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials +Launching psql session to my-database.my-host.com (attempt 1/3)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 1/3 failed, retrying in 1s... +Launching psql session to my-database.my-host.com (attempt 2/3)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 2/3 failed, retrying in 2s... +Launching psql session to my-database.my-host.com (attempt 3/3)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Error: failed to connect after 3 attempts, last error: connection failed (retryable): psql exited with code 2 + +Exit code (musterr): 1 + +=== Command should use custom number of retries: +>>> musterr [CLI] psql my-database --max-retries 5 -- --dbname=db1 -p 3000 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials +Launching psql session to my-database.my-host.com (attempt 1/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 1/5 failed, retrying in 1s... +Launching psql session to my-database.my-host.com (attempt 2/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 2/5 failed, retrying in 2s... +Launching psql session to my-database.my-host.com (attempt 3/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 3/5 failed, retrying in 4s... +Launching psql session to my-database.my-host.com (attempt 4/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 4/5 failed, retrying in 8s... +Launching psql session to my-database.my-host.com (attempt 5/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Error: failed to connect after 5 attempts, last error: connection failed (retryable): psql exited with code 2 + +Exit code (musterr): 1 + +=== Command should not use retries: +>>> musterr [CLI] psql my-database --max-retries 0 -- --dbname=db1 -p 3000 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials +Launching psql with connection to my-database.my-host.com... +Simulating connection failure with exit code '2' + +Exit code (musterr): 2 diff --git a/acceptance/cmd/psql/failing-connection/script b/acceptance/cmd/psql/failing-connection/script new file mode 100644 index 0000000000..9a51fde95a --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/script @@ -0,0 +1,17 @@ +mv always-fail.sh psql + +cleanup() { + rm psql +} +trap cleanup EXIT + +export PATH="$(pwd):$PATH" + +title "Command should use default number of retries:" +trace musterr $CLI psql my-database -- --dbname=db1 -p 3000 + +title "Command should use custom number of retries:" +trace musterr $CLI psql my-database --max-retries 5 -- --dbname=db1 -p 3000 + +title "Command should not use retries:" +trace musterr $CLI psql my-database --max-retries 0 -- --dbname=db1 -p 3000 diff --git a/acceptance/cmd/psql/failing-connection/test.toml b/acceptance/cmd/psql/failing-connection/test.toml new file mode 100644 index 0000000000..ebf13980a3 --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/test.toml @@ -0,0 +1,32 @@ +# This acceptance test is disabled on Windows runners because +# the current argument capturing method does not work on windows-latest GitHub Runner. +# +# See PR #3228 for documented attempts to fix this issue: +# https://github.com/databricks/cli/pull/3228 +GOOS.windows = false + +[[Server]] +Pattern = "GET /api/2.0/database/instances/my-database" +Response.Body = ''' +{ + "state": "AVAILABLE", + "pg_version": "14", + "read_write_dns": "my-database.my-host.com" +} +''' + +[[Server]] +Pattern = "GET /api/2.0/database/instances" +Response.Body = ''' +{ + "database_instances": [] +} +''' + +[[Server]] +Pattern = "POST /api/2.0/database/credentials" +Response.Body = ''' +{ + "token": "my-secret-token" +} +''' diff --git a/acceptance/cmd/psql/simple/output.txt b/acceptance/cmd/psql/simple/output.txt index 9168442b1a..c588bf5a62 100644 --- a/acceptance/cmd/psql/simple/output.txt +++ b/acceptance/cmd/psql/simple/output.txt @@ -23,7 +23,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=databricks_postgres PGPASSWORD=my-secret-token PGSSLMODE=require @@ -34,7 +34,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=databricks_postgres -c SELECT * FROM my_table --echo-all PGPASSWORD=my-secret-token PGSSLMODE=require @@ -45,7 +45,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=db1 PGPASSWORD=my-secret-token PGSSLMODE=require @@ -55,7 +55,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 -d db2 PGPASSWORD=my-secret-token PGSSLMODE=require @@ -66,7 +66,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --dbname=db1 -p 3000 PGPASSWORD=my-secret-token PGSSLMODE=require @@ -76,6 +76,27 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials +Launching psql session to my-database.my-host.com (attempt 1/3)... +echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] -d db2 --port=3001 +PGPASSWORD=my-secret-token +PGSSLMODE=require + +=== Command should not use retries if max-retries is set to 0: +>>> [CLI] psql my-database --max-retries 0 -- --dbname=db1 -p 3000 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials +Launching psql with connection to my-database.my-host.com... +echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --dbname=db1 -p 3000 +PGPASSWORD=my-secret-token +PGSSLMODE=require + +>>> [CLI] psql my-database --max-retries 0 -- -d db2 --port=3001 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials Launching psql with connection to my-database.my-host.com... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] -d db2 --port=3001 PGPASSWORD=my-secret-token diff --git a/acceptance/cmd/psql/simple/script b/acceptance/cmd/psql/simple/script index d662d35420..0caeef6f3e 100644 --- a/acceptance/cmd/psql/simple/script +++ b/acceptance/cmd/psql/simple/script @@ -29,3 +29,7 @@ trace $CLI psql my-database -- -d db2 title "Command should use the port from extra arguments when specified:" trace $CLI psql my-database -- --dbname=db1 -p 3000 trace $CLI psql my-database -- -d db2 --port=3001 + +title "Command should not use retries if max-retries is set to 0:" +trace $CLI psql my-database --max-retries 0 -- --dbname=db1 -p 3000 +trace $CLI psql my-database --max-retries 0 -- -d db2 --port=3001 diff --git a/cmd/psql/psql.go b/cmd/psql/psql.go index 928784a50b..47ea14d596 100644 --- a/cmd/psql/psql.go +++ b/cmd/psql/psql.go @@ -3,6 +3,7 @@ package psql import ( "errors" "fmt" + "time" "github.com/databricks/cli/libs/cmdctx" "github.com/databricks/cli/libs/cmdio" @@ -26,12 +27,17 @@ func newLakebaseConnectCommand() *cobra.Command { This command requires a psql client to be installed on your machine for the connection to work. +The command includes automatic retry logic for connection failures. You can configure the retry behavior using the flags below. + You can pass additional arguments to psql after a double-dash (--): databricks psql my-database -- -c "SELECT * FROM my_table" databricks psql my-database -- --echo-all -d "my-db" `, } + // Add retry configuration flag + cmd.Flags().Int("max-retries", 3, "Maximum number of connection retry attempts (set to 0 to disable retries)") + cmd.PreRunE = root.MustWorkspaceClient cmd.RunE = func(cmd *cobra.Command, args []string) error { ctx := cmd.Context() @@ -74,7 +80,17 @@ You can pass additional arguments to psql after a double-dash (--): databaseInstanceName := args[0] extraArgs := args[1:] - return lakebase.Connect(cmd.Context(), databaseInstanceName, extraArgs...) + // Read retry configuration from flags + maxRetries, _ := cmd.Flags().GetInt("max-retries") + + retryConfig := lakebase.RetryConfig{ + MaxRetries: maxRetries, // Retries are disables when max-retries is 0 + InitialDelay: time.Second, // Fixed initial delay + MaxDelay: 10 * time.Second, // Fixed max delay + BackoffFactor: 2.0, // Fixed backoff factor + } + + return lakebase.ConnectWithRetryConfig(cmd.Context(), databaseInstanceName, retryConfig, extraArgs...) } cmd.ValidArgsFunction = func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index 8f5536a2dc..832eae12dd 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -5,16 +5,53 @@ import ( "errors" "fmt" "os" + "os/exec" "strings" + "time" "github.com/databricks/cli/libs/cmdctx" "github.com/databricks/cli/libs/cmdio" - "github.com/databricks/cli/libs/exec" + execlib "github.com/databricks/cli/libs/exec" "github.com/databricks/databricks-sdk-go/service/database" "github.com/google/uuid" ) -func Connect(ctx context.Context, databaseInstanceName string, extraArgs ...string) error { +// RetryConfig holds configuration for connection retry behavior +type RetryConfig struct { + MaxRetries int + InitialDelay time.Duration + MaxDelay time.Duration + BackoffFactor float64 +} + +// attemptConnection launches psql interactively and returns an error if connection fails +func attemptConnection(ctx context.Context, args, env []string) error { + cmd := exec.CommandContext(ctx, args[0], args[1:]...) + cmd.Env = env + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + err := cmd.Run() + if err != nil { + // Check if the error might be due to connection issues + // Since we can't capture stderr when running interactively, we check the exit code + var exitError *exec.ExitError + if errors.As(err, &exitError) { + exitCode := exitError.ExitCode() + + // We do not use the Databricks SDK for checking whether the error is retryable because the call in question is not to the API + // psql returns exit code 2 for connection failures + if exitCode == 2 { + return fmt.Errorf("connection failed (retryable): psql exited with code %d", exitCode) + } + } + } + + return err +} + +func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, retryConfig RetryConfig, extraArgs ...string) error { cmdio.LogString(ctx, fmt.Sprintf("Connecting to Databricks Database Instance %s ...", databaseInstanceName)) w := cmdctx.WorkspaceClient(ctx) @@ -91,11 +128,61 @@ func Connect(ctx context.Context, databaseInstanceName string, extraArgs ...stri "PGSSLMODE=require", ) - cmdio.LogString(ctx, fmt.Sprintf("Launching psql with connection to %s...", db.ReadWriteDns)) + // If retries are disabled, go directly to interactive session + if retryConfig.MaxRetries <= 0 { + cmdio.LogString(ctx, fmt.Sprintf("Launching psql with connection to %s...", db.ReadWriteDns)) + return execlib.Execv(execlib.ExecvOptions{ + Args: args, + Env: cmdEnv, + }) + } - // Execute psql command inline - return exec.Execv(exec.ExecvOptions{ - Args: args, - Env: cmdEnv, - }) + // Try launching psql with retry logic + maxRetries := retryConfig.MaxRetries + delay := retryConfig.InitialDelay + + var lastErr error + for attempt := range maxRetries { + if attempt > 0 { + cmdio.LogString(ctx, fmt.Sprintf("Connection attempt %d/%d failed, retrying in %v...", attempt, maxRetries, delay)) + + // Wait with context cancellation support + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(delay): + } + + // Exponential backoff + delay = time.Duration(float64(delay) * retryConfig.BackoffFactor) + if delay > retryConfig.MaxDelay { + delay = retryConfig.MaxDelay + } + } + + cmdio.LogString(ctx, fmt.Sprintf("Launching psql session to %s (attempt %d/%d)...", db.ReadWriteDns, attempt+1, maxRetries)) + + // Try to launch psql and capture the exit status + err := attemptConnection(ctx, args, cmdEnv) + if err == nil { + // psql exited normally (user quit) + return nil + } + + lastErr = err + + // Check if this is a retryable error + // We do not use the Databricks SDK for checking whether the error is retryable because the call in question is not to the API + if !strings.Contains(err.Error(), "connection failed (retryable)") { + // Non-retryable error, fail immediately + return err + } + + if attempt < maxRetries { + cmdio.LogString(ctx, fmt.Sprintf("Connection failed with retryable error: %v", err)) + } + } + + // All retries exhausted + return fmt.Errorf("failed to connect after %d attempts, last error: %w", maxRetries, lastErr) } diff --git a/libs/lakebase/connect_test.go b/libs/lakebase/connect_test.go new file mode 100644 index 0000000000..b91a717522 --- /dev/null +++ b/libs/lakebase/connect_test.go @@ -0,0 +1,31 @@ +package lakebase + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTryPsqlInteractive(t *testing.T) { + ctx := context.Background() + + // Test successful execution (exit code 0) + args := []string{"echo", "success"} + var env []string + err := attemptConnection(ctx, args, env) + assert.NoError(t, err) + + // Test connection failure (exit code 2) - simulate with false command + args = []string{"sh", "-c", "exit 2"} + err = attemptConnection(ctx, args, env) + assert.Error(t, err) + assert.Contains(t, err.Error(), "connection failed (retryable)") + assert.Contains(t, err.Error(), "psql exited with code 2") + + // Test other failure (exit code 1) - should not be retryable + args = []string{"sh", "-c", "exit 1"} + err = attemptConnection(ctx, args, env) + assert.Error(t, err) + assert.NotContains(t, err.Error(), "connection failed (retryable)") +}