From 6467a9dec1f4e2e1f54fd78c275e279a8fc405a6 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Tue, 26 Aug 2025 14:51:54 +0200 Subject: [PATCH 01/12] introduce retries to `databricks psql` command --- cmd/psql/psql.go | 19 ++++- libs/lakebase/connect.go | 148 ++++++++++++++++++++++++++++++++-- libs/lakebase/connect_test.go | 113 ++++++++++++++++++++++++++ 3 files changed, 272 insertions(+), 8 deletions(-) create mode 100644 libs/lakebase/connect_test.go diff --git a/cmd/psql/psql.go b/cmd/psql/psql.go index 928784a50b..2be4a3e232 100644 --- a/cmd/psql/psql.go +++ b/cmd/psql/psql.go @@ -3,6 +3,7 @@ package psql import ( "errors" "fmt" + "time" "github.com/databricks/cli/libs/cmdctx" "github.com/databricks/cli/libs/cmdio" @@ -26,12 +27,17 @@ func newLakebaseConnectCommand() *cobra.Command { This command requires a psql client to be installed on your machine for the connection to work. +The command includes automatic retry logic for connection failures. You can configure the retry behavior using the flags below. + You can pass additional arguments to psql after a double-dash (--): databricks psql my-database -- -c "SELECT * FROM my_table" databricks psql my-database -- --echo-all -d "my-db" `, } + // Add retry configuration flag + cmd.Flags().Int("max-retries", 2, "Maximum number of connection retry attempts (set to 0 to disable retries)") + cmd.PreRunE = root.MustWorkspaceClient cmd.RunE = func(cmd *cobra.Command, args []string) error { ctx := cmd.Context() @@ -74,7 +80,18 @@ You can pass additional arguments to psql after a double-dash (--): databaseInstanceName := args[0] extraArgs := args[1:] - return lakebase.Connect(cmd.Context(), databaseInstanceName, extraArgs...) + // Read retry configuration from flags + maxRetries, _ := cmd.Flags().GetInt("max-retries") + + retryConfig := &lakebase.RetryConfig{ + MaxRetries: maxRetries, + InitialDelay: time.Second, // Fixed initial delay + MaxDelay: 10 * time.Second, // Fixed max delay + BackoffFactor: 2.0, // Fixed backoff factor + Enabled: maxRetries > 0, // Disable retries when max-retries is 0 + } + + return lakebase.ConnectWithRetryConfig(cmd.Context(), databaseInstanceName, retryConfig, extraArgs...) } cmd.ValidArgsFunction = func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index 8f5536a2dc..851fe8a095 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -5,16 +5,90 @@ import ( "errors" "fmt" "os" + "os/exec" "strings" + "time" "github.com/databricks/cli/libs/cmdctx" "github.com/databricks/cli/libs/cmdio" - "github.com/databricks/cli/libs/exec" + execlib "github.com/databricks/cli/libs/exec" "github.com/databricks/databricks-sdk-go/service/database" "github.com/google/uuid" ) +const ( + // Default retry configuration + defaultMaxRetries = 3 + defaultInitialDelay = 1 * time.Second + defaultMaxDelay = 10 * time.Second + defaultBackoffFactor = 2.0 +) + +// RetryConfig holds configuration for connection retry behavior +type RetryConfig struct { + MaxRetries int + InitialDelay time.Duration + MaxDelay time.Duration + BackoffFactor float64 + Enabled bool +} + +// isRetryableConnectionError checks if the error output indicates a retryable connection issue +func isRetryableConnectionError(output []byte) bool { + outputStr := string(output) + + // Check for specific error patterns that indicate connection issues + retryablePatterns := []string{ + "server closed the connection unexpectedly", + "connection to server", + "could not connect to server", + "Connection timed out", + "Connection refused", + "External authorization failed", + "This could be due to paused instances", + "blocked by Databricks IP ACL", + } + + for _, pattern := range retryablePatterns { + if strings.Contains(outputStr, pattern) { + return true + } + } + + return false +} + +// tryPsqlInteractive launches psql interactively and returns an error if connection fails +func tryPsqlInteractive(ctx context.Context, args, env []string) error { + cmd := exec.CommandContext(ctx, args[0], args[1:]...) + cmd.Env = env + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + err := cmd.Run() + if err != nil { + // Check if the error might be due to connection issues + // Since we can't capture stderr when running interactively, we check the exit code + var exitError *exec.ExitError + if errors.As(err, &exitError) { + exitCode := exitError.ExitCode() + // psql returns exit code 2 for connection failures + if exitCode == 2 { + return fmt.Errorf("connection failed (retryable): psql exited with code %d", exitCode) + } + } + return err + } + + return nil +} + func Connect(ctx context.Context, databaseInstanceName string, extraArgs ...string) error { + return ConnectWithRetryConfig(ctx, databaseInstanceName, nil, extraArgs...) +} + +func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, retryConfig *RetryConfig, extraArgs ...string) error { cmdio.LogString(ctx, fmt.Sprintf("Connecting to Databricks Database Instance %s ...", databaseInstanceName)) w := cmdctx.WorkspaceClient(ctx) @@ -91,11 +165,71 @@ func Connect(ctx context.Context, databaseInstanceName string, extraArgs ...stri "PGSSLMODE=require", ) - cmdio.LogString(ctx, fmt.Sprintf("Launching psql with connection to %s...", db.ReadWriteDns)) + // Use provided retry configuration or defaults + if retryConfig == nil { + retryConfig = &RetryConfig{ + MaxRetries: defaultMaxRetries, + InitialDelay: defaultInitialDelay, + MaxDelay: defaultMaxDelay, + BackoffFactor: defaultBackoffFactor, + Enabled: true, + } + } - // Execute psql command inline - return exec.Execv(exec.ExecvOptions{ - Args: args, - Env: cmdEnv, - }) + // If retries are disabled, go directly to interactive session + if !retryConfig.Enabled { + cmdio.LogString(ctx, fmt.Sprintf("Launching psql with connection to %s...", db.ReadWriteDns)) + return execlib.Execv(execlib.ExecvOptions{ + Args: args, + Env: cmdEnv, + }) + } + + // Try launching psql with retry logic + maxRetries := retryConfig.MaxRetries + delay := retryConfig.InitialDelay + + var lastErr error + for attempt := 0; attempt <= maxRetries; attempt++ { + if attempt > 0 { + cmdio.LogString(ctx, fmt.Sprintf("Connection attempt %d/%d failed, retrying in %v...", attempt, maxRetries+1, delay)) + + // Wait with context cancellation support + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(delay): + } + + // Exponential backoff + delay = time.Duration(float64(delay) * retryConfig.BackoffFactor) + if delay > retryConfig.MaxDelay { + delay = retryConfig.MaxDelay + } + } + + cmdio.LogString(ctx, fmt.Sprintf("Launching psql session to %s (attempt %d/%d)...", db.ReadWriteDns, attempt+1, maxRetries+1)) + + // Try to launch psql and capture the exit status + err := tryPsqlInteractive(ctx, args, cmdEnv) + if err == nil { + // psql exited normally (user quit) + return nil + } + + lastErr = err + + // Check if this is a retryable error + if !strings.Contains(err.Error(), "connection failed (retryable)") { + // Non-retryable error, fail immediately + return err + } + + if attempt < maxRetries { + cmdio.LogString(ctx, fmt.Sprintf("Connection failed with retryable error: %v", err)) + } + } + + // All retries exhausted + return fmt.Errorf("failed to connect after %d attempts, last error: %w", maxRetries+1, lastErr) } diff --git a/libs/lakebase/connect_test.go b/libs/lakebase/connect_test.go new file mode 100644 index 0000000000..10de68b36e --- /dev/null +++ b/libs/lakebase/connect_test.go @@ -0,0 +1,113 @@ +package lakebase + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIsRetryableConnectionError(t *testing.T) { + tests := []struct { + name string + output string + expected bool + }{ + { + name: "server closed connection unexpectedly", + output: "psql: error: connection to server at \"instance-xyz.database.cloud.databricks.com\" (44.234.192.47), port 5432 failed: server closed the connection unexpectedly", + expected: true, + }, + { + name: "connection to server failed", + output: "psql: error: connection to server at \"instance-xyz.database.cloud.databricks.com\" failed", + expected: true, + }, + { + name: "external authorization failed", + output: "psql: error: External authorization failed. DETAIL: This could be due to paused instances", + expected: true, + }, + { + name: "blocked by IP ACL", + output: "psql: error: Source IP address: 108.224.88.11 is blocked by Databricks IP ACL for workspace: 12345678900987654321", + expected: true, + }, + { + name: "connection refused", + output: "psql: error: Connection refused", + expected: true, + }, + { + name: "connection timeout", + output: "psql: error: Connection timed out", + expected: true, + }, + { + name: "non-retryable error - syntax error", + output: "psql: error: syntax error at line 1", + expected: false, + }, + { + name: "non-retryable error - authentication failed", + output: "psql: FATAL: password authentication failed for user", + expected: false, + }, + { + name: "non-retryable error - database does not exist", + output: "psql: FATAL: database \"nonexistent\" does not exist", + expected: false, + }, + { + name: "empty output", + output: "", + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := isRetryableConnectionError([]byte(tt.output)) + assert.Equal(t, tt.expected, result, "Test case: %s", tt.name) + }) + } +} + +func TestRetryConfigDefaults(t *testing.T) { + config := &RetryConfig{ + MaxRetries: defaultMaxRetries, + InitialDelay: defaultInitialDelay, + MaxDelay: defaultMaxDelay, + BackoffFactor: defaultBackoffFactor, + Enabled: true, + } + + assert.Equal(t, 3, config.MaxRetries) + assert.Equal(t, "1s", config.InitialDelay.String()) + assert.Equal(t, "10s", config.MaxDelay.String()) + assert.InEpsilon(t, 2.0, config.BackoffFactor, 0.001) + assert.True(t, config.Enabled) +} + +func TestTryPsqlInteractive(t *testing.T) { + ctx := context.Background() + + // Test successful execution (exit code 0) + args := []string{"echo", "success"} + var env []string + err := tryPsqlInteractive(ctx, args, env) + assert.NoError(t, err) + + // Test connection failure (exit code 2) - simulate with false command + args = []string{"sh", "-c", "exit 2"} + err = tryPsqlInteractive(ctx, args, env) + assert.Error(t, err) + assert.Contains(t, err.Error(), "connection failed (retryable)") + assert.Contains(t, err.Error(), "psql exited with code 2") + + // Test other failure (exit code 1) - should not be retryable + args = []string{"sh", "-c", "exit 1"} + err = tryPsqlInteractive(ctx, args, env) + assert.Error(t, err) + assert.NotContains(t, err.Error(), "connection failed (retryable)") +} From d6e0008466b2b76833cebffbdb6df1f1033e6f47 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:02:28 +0200 Subject: [PATCH 02/12] add acceptance tests for retries in `databricks psql` command --- .../psql/failing-connection/always-fail.sh | 7 ++++ .../cmd/psql/failing-connection/out.test.toml | 8 +++++ .../cmd/psql/failing-connection/output.txt | 20 ++++++++++++ acceptance/cmd/psql/failing-connection/script | 11 +++++++ .../cmd/psql/failing-connection/test.toml | 32 +++++++++++++++++++ acceptance/cmd/psql/simple/output.txt | 31 +++++++++++++++--- acceptance/cmd/psql/simple/script | 4 +++ 7 files changed, 108 insertions(+), 5 deletions(-) create mode 100755 acceptance/cmd/psql/failing-connection/always-fail.sh create mode 100644 acceptance/cmd/psql/failing-connection/out.test.toml create mode 100644 acceptance/cmd/psql/failing-connection/output.txt create mode 100644 acceptance/cmd/psql/failing-connection/script create mode 100644 acceptance/cmd/psql/failing-connection/test.toml diff --git a/acceptance/cmd/psql/failing-connection/always-fail.sh b/acceptance/cmd/psql/failing-connection/always-fail.sh new file mode 100755 index 0000000000..a7596688f6 --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/always-fail.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# +# This script prints its arguments and exits. +# The test script renames this script to "psql" in order to capture the arguments that the CLI passes to psql command. +# +echo "Simulating connection failure with exit code '2'" +exit 2 diff --git a/acceptance/cmd/psql/failing-connection/out.test.toml b/acceptance/cmd/psql/failing-connection/out.test.toml new file mode 100644 index 0000000000..04844bff9e --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/out.test.toml @@ -0,0 +1,8 @@ +Local = true +Cloud = false + +[GOOS] + windows = false + +[EnvMatrix] + DATABRICKS_CLI_DEPLOYMENT = ["terraform", "direct-exp"] diff --git a/acceptance/cmd/psql/failing-connection/output.txt b/acceptance/cmd/psql/failing-connection/output.txt new file mode 100644 index 0000000000..e8cead6863 --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/output.txt @@ -0,0 +1,20 @@ + +=== Command should use default number of retries: +>>> musterr [CLI] psql my-database -- --dbname=db1 -p 3000 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials +Launching psql session to my-database.my-host.com (attempt 1/3)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 1/3 failed, retrying in 1s... +Launching psql session to my-database.my-host.com (attempt 2/3)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 2/3 failed, retrying in 2s... +Launching psql session to my-database.my-host.com (attempt 3/3)... +Simulating connection failure with exit code '2' +Error: failed to connect after 3 attempts, last error: connection failed (retryable): psql exited with code 2 + +Exit code (musterr): 1 diff --git a/acceptance/cmd/psql/failing-connection/script b/acceptance/cmd/psql/failing-connection/script new file mode 100644 index 0000000000..b79ea48a31 --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/script @@ -0,0 +1,11 @@ +mv always-fail.sh psql + +cleanup() { + rm psql +} +trap cleanup EXIT + +export PATH="$(pwd):$PATH" + +title "Command should use default number of retries:" +trace musterr $CLI psql my-database -- --dbname=db1 -p 3000 diff --git a/acceptance/cmd/psql/failing-connection/test.toml b/acceptance/cmd/psql/failing-connection/test.toml new file mode 100644 index 0000000000..ebf13980a3 --- /dev/null +++ b/acceptance/cmd/psql/failing-connection/test.toml @@ -0,0 +1,32 @@ +# This acceptance test is disabled on Windows runners because +# the current argument capturing method does not work on windows-latest GitHub Runner. +# +# See PR #3228 for documented attempts to fix this issue: +# https://github.com/databricks/cli/pull/3228 +GOOS.windows = false + +[[Server]] +Pattern = "GET /api/2.0/database/instances/my-database" +Response.Body = ''' +{ + "state": "AVAILABLE", + "pg_version": "14", + "read_write_dns": "my-database.my-host.com" +} +''' + +[[Server]] +Pattern = "GET /api/2.0/database/instances" +Response.Body = ''' +{ + "database_instances": [] +} +''' + +[[Server]] +Pattern = "POST /api/2.0/database/credentials" +Response.Body = ''' +{ + "token": "my-secret-token" +} +''' diff --git a/acceptance/cmd/psql/simple/output.txt b/acceptance/cmd/psql/simple/output.txt index 9168442b1a..c588bf5a62 100644 --- a/acceptance/cmd/psql/simple/output.txt +++ b/acceptance/cmd/psql/simple/output.txt @@ -23,7 +23,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=databricks_postgres PGPASSWORD=my-secret-token PGSSLMODE=require @@ -34,7 +34,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=databricks_postgres -c SELECT * FROM my_table --echo-all PGPASSWORD=my-secret-token PGSSLMODE=require @@ -45,7 +45,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=db1 PGPASSWORD=my-secret-token PGSSLMODE=require @@ -55,7 +55,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 -d db2 PGPASSWORD=my-secret-token PGSSLMODE=require @@ -66,7 +66,7 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials -Launching psql with connection to my-database.my-host.com... +Launching psql session to my-database.my-host.com (attempt 1/3)... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --dbname=db1 -p 3000 PGPASSWORD=my-secret-token PGSSLMODE=require @@ -76,6 +76,27 @@ Connecting to Databricks Database Instance my-database ... Postgres version: 14 Database instance status: AVAILABLE Successfully fetched database credentials +Launching psql session to my-database.my-host.com (attempt 1/3)... +echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] -d db2 --port=3001 +PGPASSWORD=my-secret-token +PGSSLMODE=require + +=== Command should not use retries if max-retries is set to 0: +>>> [CLI] psql my-database --max-retries 0 -- --dbname=db1 -p 3000 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials +Launching psql with connection to my-database.my-host.com... +echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --dbname=db1 -p 3000 +PGPASSWORD=my-secret-token +PGSSLMODE=require + +>>> [CLI] psql my-database --max-retries 0 -- -d db2 --port=3001 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials Launching psql with connection to my-database.my-host.com... echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] -d db2 --port=3001 PGPASSWORD=my-secret-token diff --git a/acceptance/cmd/psql/simple/script b/acceptance/cmd/psql/simple/script index d662d35420..0caeef6f3e 100644 --- a/acceptance/cmd/psql/simple/script +++ b/acceptance/cmd/psql/simple/script @@ -29,3 +29,7 @@ trace $CLI psql my-database -- -d db2 title "Command should use the port from extra arguments when specified:" trace $CLI psql my-database -- --dbname=db1 -p 3000 trace $CLI psql my-database -- -d db2 --port=3001 + +title "Command should not use retries if max-retries is set to 0:" +trace $CLI psql my-database --max-retries 0 -- --dbname=db1 -p 3000 +trace $CLI psql my-database --max-retries 0 -- -d db2 --port=3001 From 72fb5ae31b8f752a7ca496051ee4c634b082d59c Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:15:51 +0200 Subject: [PATCH 03/12] update NEXT_CHANGELOG.md --- NEXT_CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index d6b5dde4f9..ab25ef5156 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -11,6 +11,7 @@ See more details here: ([#3225](https://github.com/databricks/cli/pull/3225)) * Add support volumes in Python support ([#3383])(https://github.com/databricks/cli/pull/3383)) ### CLI +* Introduce retries to `databricks psql` command ([#3492](https://github.com/databricks/cli/pull/3492)) ### Dependency updates From 4912b2b71c430b52dda952b1c37d4a3e137c8739 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 11:29:09 +0200 Subject: [PATCH 04/12] refactor: remove the `Enabled` field --- cmd/psql/psql.go | 3 +-- libs/lakebase/connect.go | 4 +--- libs/lakebase/connect_test.go | 2 -- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cmd/psql/psql.go b/cmd/psql/psql.go index 2be4a3e232..e514d2396a 100644 --- a/cmd/psql/psql.go +++ b/cmd/psql/psql.go @@ -84,11 +84,10 @@ You can pass additional arguments to psql after a double-dash (--): maxRetries, _ := cmd.Flags().GetInt("max-retries") retryConfig := &lakebase.RetryConfig{ - MaxRetries: maxRetries, + MaxRetries: maxRetries, // Retries are disables when max-retries is 0 InitialDelay: time.Second, // Fixed initial delay MaxDelay: 10 * time.Second, // Fixed max delay BackoffFactor: 2.0, // Fixed backoff factor - Enabled: maxRetries > 0, // Disable retries when max-retries is 0 } return lakebase.ConnectWithRetryConfig(cmd.Context(), databaseInstanceName, retryConfig, extraArgs...) diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index 851fe8a095..93586cb95f 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -30,7 +30,6 @@ type RetryConfig struct { InitialDelay time.Duration MaxDelay time.Duration BackoffFactor float64 - Enabled bool } // isRetryableConnectionError checks if the error output indicates a retryable connection issue @@ -172,12 +171,11 @@ func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, re InitialDelay: defaultInitialDelay, MaxDelay: defaultMaxDelay, BackoffFactor: defaultBackoffFactor, - Enabled: true, } } // If retries are disabled, go directly to interactive session - if !retryConfig.Enabled { + if retryConfig.MaxRetries <= 0 { cmdio.LogString(ctx, fmt.Sprintf("Launching psql with connection to %s...", db.ReadWriteDns)) return execlib.Execv(execlib.ExecvOptions{ Args: args, diff --git a/libs/lakebase/connect_test.go b/libs/lakebase/connect_test.go index 10de68b36e..d8073221a8 100644 --- a/libs/lakebase/connect_test.go +++ b/libs/lakebase/connect_test.go @@ -79,14 +79,12 @@ func TestRetryConfigDefaults(t *testing.T) { InitialDelay: defaultInitialDelay, MaxDelay: defaultMaxDelay, BackoffFactor: defaultBackoffFactor, - Enabled: true, } assert.Equal(t, 3, config.MaxRetries) assert.Equal(t, "1s", config.InitialDelay.String()) assert.Equal(t, "10s", config.MaxDelay.String()) assert.InEpsilon(t, 2.0, config.BackoffFactor, 0.001) - assert.True(t, config.Enabled) } func TestTryPsqlInteractive(t *testing.T) { From 66d52d5baf62bb654dd511c65ae52e5b58e5f80e Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 11:32:11 +0200 Subject: [PATCH 05/12] refactor: remove the `isRetryableConnectionError` method --- libs/lakebase/connect.go | 25 ------------- libs/lakebase/connect_test.go | 66 ----------------------------------- 2 files changed, 91 deletions(-) diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index 93586cb95f..2d1723e204 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -32,31 +32,6 @@ type RetryConfig struct { BackoffFactor float64 } -// isRetryableConnectionError checks if the error output indicates a retryable connection issue -func isRetryableConnectionError(output []byte) bool { - outputStr := string(output) - - // Check for specific error patterns that indicate connection issues - retryablePatterns := []string{ - "server closed the connection unexpectedly", - "connection to server", - "could not connect to server", - "Connection timed out", - "Connection refused", - "External authorization failed", - "This could be due to paused instances", - "blocked by Databricks IP ACL", - } - - for _, pattern := range retryablePatterns { - if strings.Contains(outputStr, pattern) { - return true - } - } - - return false -} - // tryPsqlInteractive launches psql interactively and returns an error if connection fails func tryPsqlInteractive(ctx context.Context, args, env []string) error { cmd := exec.CommandContext(ctx, args[0], args[1:]...) diff --git a/libs/lakebase/connect_test.go b/libs/lakebase/connect_test.go index d8073221a8..8dbea7f94b 100644 --- a/libs/lakebase/connect_test.go +++ b/libs/lakebase/connect_test.go @@ -7,72 +7,6 @@ import ( "github.com/stretchr/testify/assert" ) -func TestIsRetryableConnectionError(t *testing.T) { - tests := []struct { - name string - output string - expected bool - }{ - { - name: "server closed connection unexpectedly", - output: "psql: error: connection to server at \"instance-xyz.database.cloud.databricks.com\" (44.234.192.47), port 5432 failed: server closed the connection unexpectedly", - expected: true, - }, - { - name: "connection to server failed", - output: "psql: error: connection to server at \"instance-xyz.database.cloud.databricks.com\" failed", - expected: true, - }, - { - name: "external authorization failed", - output: "psql: error: External authorization failed. DETAIL: This could be due to paused instances", - expected: true, - }, - { - name: "blocked by IP ACL", - output: "psql: error: Source IP address: 108.224.88.11 is blocked by Databricks IP ACL for workspace: 12345678900987654321", - expected: true, - }, - { - name: "connection refused", - output: "psql: error: Connection refused", - expected: true, - }, - { - name: "connection timeout", - output: "psql: error: Connection timed out", - expected: true, - }, - { - name: "non-retryable error - syntax error", - output: "psql: error: syntax error at line 1", - expected: false, - }, - { - name: "non-retryable error - authentication failed", - output: "psql: FATAL: password authentication failed for user", - expected: false, - }, - { - name: "non-retryable error - database does not exist", - output: "psql: FATAL: database \"nonexistent\" does not exist", - expected: false, - }, - { - name: "empty output", - output: "", - expected: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := isRetryableConnectionError([]byte(tt.output)) - assert.Equal(t, tt.expected, result, "Test case: %s", tt.name) - }) - } -} - func TestRetryConfigDefaults(t *testing.T) { config := &RetryConfig{ MaxRetries: defaultMaxRetries, From 7dbc8ba8d0a2a04d6e3c1a13b7b00a1f70b5e306 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 11:39:57 +0200 Subject: [PATCH 06/12] add a comment on retryable errors and why we are not using the SDK for them --- libs/lakebase/connect.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index 2d1723e204..9d31354cf6 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -47,6 +47,8 @@ func tryPsqlInteractive(ctx context.Context, args, env []string) error { var exitError *exec.ExitError if errors.As(err, &exitError) { exitCode := exitError.ExitCode() + + // We do not use the Databricks SDK for checking whether the error is retryable because the call in question is not to the API // psql returns exit code 2 for connection failures if exitCode == 2 { return fmt.Errorf("connection failed (retryable): psql exited with code %d", exitCode) @@ -193,6 +195,7 @@ func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, re lastErr = err // Check if this is a retryable error + // We do not use the Databricks SDK for checking whether the error is retryable because the call in question is not to the API if !strings.Contains(err.Error(), "connection failed (retryable)") { // Non-retryable error, fail immediately return err From d9cbf899e5d5f7d7b9bc72414d3bf50d5d3f4e7d Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 13:34:44 +0200 Subject: [PATCH 07/12] refactor: remove constants with defaults; remove superfluous unit test --- libs/lakebase/connect.go | 24 +----------------------- libs/lakebase/connect_test.go | 14 -------------- 2 files changed, 1 insertion(+), 37 deletions(-) diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index 9d31354cf6..51c262746d 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -16,14 +16,6 @@ import ( "github.com/google/uuid" ) -const ( - // Default retry configuration - defaultMaxRetries = 3 - defaultInitialDelay = 1 * time.Second - defaultMaxDelay = 10 * time.Second - defaultBackoffFactor = 2.0 -) - // RetryConfig holds configuration for connection retry behavior type RetryConfig struct { MaxRetries int @@ -60,11 +52,7 @@ func tryPsqlInteractive(ctx context.Context, args, env []string) error { return nil } -func Connect(ctx context.Context, databaseInstanceName string, extraArgs ...string) error { - return ConnectWithRetryConfig(ctx, databaseInstanceName, nil, extraArgs...) -} - -func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, retryConfig *RetryConfig, extraArgs ...string) error { +func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, retryConfig RetryConfig, extraArgs ...string) error { cmdio.LogString(ctx, fmt.Sprintf("Connecting to Databricks Database Instance %s ...", databaseInstanceName)) w := cmdctx.WorkspaceClient(ctx) @@ -141,16 +129,6 @@ func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, re "PGSSLMODE=require", ) - // Use provided retry configuration or defaults - if retryConfig == nil { - retryConfig = &RetryConfig{ - MaxRetries: defaultMaxRetries, - InitialDelay: defaultInitialDelay, - MaxDelay: defaultMaxDelay, - BackoffFactor: defaultBackoffFactor, - } - } - // If retries are disabled, go directly to interactive session if retryConfig.MaxRetries <= 0 { cmdio.LogString(ctx, fmt.Sprintf("Launching psql with connection to %s...", db.ReadWriteDns)) diff --git a/libs/lakebase/connect_test.go b/libs/lakebase/connect_test.go index 8dbea7f94b..46fc87f0d8 100644 --- a/libs/lakebase/connect_test.go +++ b/libs/lakebase/connect_test.go @@ -7,20 +7,6 @@ import ( "github.com/stretchr/testify/assert" ) -func TestRetryConfigDefaults(t *testing.T) { - config := &RetryConfig{ - MaxRetries: defaultMaxRetries, - InitialDelay: defaultInitialDelay, - MaxDelay: defaultMaxDelay, - BackoffFactor: defaultBackoffFactor, - } - - assert.Equal(t, 3, config.MaxRetries) - assert.Equal(t, "1s", config.InitialDelay.String()) - assert.Equal(t, "10s", config.MaxDelay.String()) - assert.InEpsilon(t, 2.0, config.BackoffFactor, 0.001) -} - func TestTryPsqlInteractive(t *testing.T) { ctx := context.Background() From b2c814008d8f838552194bbecb266a83e122056b Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 13:35:36 +0200 Subject: [PATCH 08/12] refactor: RetryConfig is a struct, not a pointer --- cmd/psql/psql.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/psql/psql.go b/cmd/psql/psql.go index e514d2396a..a48d76725a 100644 --- a/cmd/psql/psql.go +++ b/cmd/psql/psql.go @@ -83,7 +83,7 @@ You can pass additional arguments to psql after a double-dash (--): // Read retry configuration from flags maxRetries, _ := cmd.Flags().GetInt("max-retries") - retryConfig := &lakebase.RetryConfig{ + retryConfig := lakebase.RetryConfig{ MaxRetries: maxRetries, // Retries are disables when max-retries is 0 InitialDelay: time.Second, // Fixed initial delay MaxDelay: 10 * time.Second, // Fixed max delay From 1bebd21aba766c420b65e82c69ddd60b963ef6cc Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 13:40:28 +0200 Subject: [PATCH 09/12] rename `tryPsqlInteractive` to `attemptConnection` --- libs/lakebase/connect.go | 6 +++--- libs/lakebase/connect_test.go | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index 51c262746d..5519b0761e 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -24,8 +24,8 @@ type RetryConfig struct { BackoffFactor float64 } -// tryPsqlInteractive launches psql interactively and returns an error if connection fails -func tryPsqlInteractive(ctx context.Context, args, env []string) error { +// attemptConnection launches psql interactively and returns an error if connection fails +func attemptConnection(ctx context.Context, args, env []string) error { cmd := exec.CommandContext(ctx, args[0], args[1:]...) cmd.Env = env cmd.Stdin = os.Stdin @@ -164,7 +164,7 @@ func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, re cmdio.LogString(ctx, fmt.Sprintf("Launching psql session to %s (attempt %d/%d)...", db.ReadWriteDns, attempt+1, maxRetries+1)) // Try to launch psql and capture the exit status - err := tryPsqlInteractive(ctx, args, cmdEnv) + err := attemptConnection(ctx, args, cmdEnv) if err == nil { // psql exited normally (user quit) return nil diff --git a/libs/lakebase/connect_test.go b/libs/lakebase/connect_test.go index 46fc87f0d8..b91a717522 100644 --- a/libs/lakebase/connect_test.go +++ b/libs/lakebase/connect_test.go @@ -13,19 +13,19 @@ func TestTryPsqlInteractive(t *testing.T) { // Test successful execution (exit code 0) args := []string{"echo", "success"} var env []string - err := tryPsqlInteractive(ctx, args, env) + err := attemptConnection(ctx, args, env) assert.NoError(t, err) // Test connection failure (exit code 2) - simulate with false command args = []string{"sh", "-c", "exit 2"} - err = tryPsqlInteractive(ctx, args, env) + err = attemptConnection(ctx, args, env) assert.Error(t, err) assert.Contains(t, err.Error(), "connection failed (retryable)") assert.Contains(t, err.Error(), "psql exited with code 2") // Test other failure (exit code 1) - should not be retryable args = []string{"sh", "-c", "exit 1"} - err = tryPsqlInteractive(ctx, args, env) + err = attemptConnection(ctx, args, env) assert.Error(t, err) assert.NotContains(t, err.Error(), "connection failed (retryable)") } From e877c452df0c55f48a5c50d42e65247ff6004238 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 13:41:48 +0200 Subject: [PATCH 10/12] minor refactoring --- libs/lakebase/connect.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index 5519b0761e..c4ec710874 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -33,6 +33,7 @@ func attemptConnection(ctx context.Context, args, env []string) error { cmd.Stderr = os.Stderr err := cmd.Run() + if err != nil { // Check if the error might be due to connection issues // Since we can't capture stderr when running interactively, we check the exit code @@ -46,10 +47,9 @@ func attemptConnection(ctx context.Context, args, env []string) error { return fmt.Errorf("connection failed (retryable): psql exited with code %d", exitCode) } } - return err } - return nil + return err } func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, retryConfig RetryConfig, extraArgs ...string) error { From a00df4e4f85312dd0491709eb940d32d18068977 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 13:42:12 +0200 Subject: [PATCH 11/12] make lint --- libs/lakebase/connect.go | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index c4ec710874..d16fb66ee5 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -33,7 +33,6 @@ func attemptConnection(ctx context.Context, args, env []string) error { cmd.Stderr = os.Stderr err := cmd.Run() - if err != nil { // Check if the error might be due to connection issues // Since we can't capture stderr when running interactively, we check the exit code From 3ea432804287521517385be05d4f6d0c3988c2cd Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Wed, 27 Aug 2025 13:54:31 +0200 Subject: [PATCH 12/12] fix off-by-one error with custom number of retries --- .../cmd/psql/failing-connection/output.txt | 41 +++++++++++++++++++ acceptance/cmd/psql/failing-connection/script | 6 +++ cmd/psql/psql.go | 2 +- libs/lakebase/connect.go | 8 ++-- 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/acceptance/cmd/psql/failing-connection/output.txt b/acceptance/cmd/psql/failing-connection/output.txt index e8cead6863..5cd6350d78 100644 --- a/acceptance/cmd/psql/failing-connection/output.txt +++ b/acceptance/cmd/psql/failing-connection/output.txt @@ -15,6 +15,47 @@ Connection failed with retryable error: connection failed (retryable): psql exit Connection attempt 2/3 failed, retrying in 2s... Launching psql session to my-database.my-host.com (attempt 3/3)... Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 Error: failed to connect after 3 attempts, last error: connection failed (retryable): psql exited with code 2 Exit code (musterr): 1 + +=== Command should use custom number of retries: +>>> musterr [CLI] psql my-database --max-retries 5 -- --dbname=db1 -p 3000 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials +Launching psql session to my-database.my-host.com (attempt 1/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 1/5 failed, retrying in 1s... +Launching psql session to my-database.my-host.com (attempt 2/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 2/5 failed, retrying in 2s... +Launching psql session to my-database.my-host.com (attempt 3/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 3/5 failed, retrying in 4s... +Launching psql session to my-database.my-host.com (attempt 4/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Connection attempt 4/5 failed, retrying in 8s... +Launching psql session to my-database.my-host.com (attempt 5/5)... +Simulating connection failure with exit code '2' +Connection failed with retryable error: connection failed (retryable): psql exited with code 2 +Error: failed to connect after 5 attempts, last error: connection failed (retryable): psql exited with code 2 + +Exit code (musterr): 1 + +=== Command should not use retries: +>>> musterr [CLI] psql my-database --max-retries 0 -- --dbname=db1 -p 3000 +Connecting to Databricks Database Instance my-database ... +Postgres version: 14 +Database instance status: AVAILABLE +Successfully fetched database credentials +Launching psql with connection to my-database.my-host.com... +Simulating connection failure with exit code '2' + +Exit code (musterr): 2 diff --git a/acceptance/cmd/psql/failing-connection/script b/acceptance/cmd/psql/failing-connection/script index b79ea48a31..9a51fde95a 100644 --- a/acceptance/cmd/psql/failing-connection/script +++ b/acceptance/cmd/psql/failing-connection/script @@ -9,3 +9,9 @@ export PATH="$(pwd):$PATH" title "Command should use default number of retries:" trace musterr $CLI psql my-database -- --dbname=db1 -p 3000 + +title "Command should use custom number of retries:" +trace musterr $CLI psql my-database --max-retries 5 -- --dbname=db1 -p 3000 + +title "Command should not use retries:" +trace musterr $CLI psql my-database --max-retries 0 -- --dbname=db1 -p 3000 diff --git a/cmd/psql/psql.go b/cmd/psql/psql.go index a48d76725a..47ea14d596 100644 --- a/cmd/psql/psql.go +++ b/cmd/psql/psql.go @@ -36,7 +36,7 @@ You can pass additional arguments to psql after a double-dash (--): } // Add retry configuration flag - cmd.Flags().Int("max-retries", 2, "Maximum number of connection retry attempts (set to 0 to disable retries)") + cmd.Flags().Int("max-retries", 3, "Maximum number of connection retry attempts (set to 0 to disable retries)") cmd.PreRunE = root.MustWorkspaceClient cmd.RunE = func(cmd *cobra.Command, args []string) error { diff --git a/libs/lakebase/connect.go b/libs/lakebase/connect.go index d16fb66ee5..832eae12dd 100644 --- a/libs/lakebase/connect.go +++ b/libs/lakebase/connect.go @@ -142,9 +142,9 @@ func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, re delay := retryConfig.InitialDelay var lastErr error - for attempt := 0; attempt <= maxRetries; attempt++ { + for attempt := range maxRetries { if attempt > 0 { - cmdio.LogString(ctx, fmt.Sprintf("Connection attempt %d/%d failed, retrying in %v...", attempt, maxRetries+1, delay)) + cmdio.LogString(ctx, fmt.Sprintf("Connection attempt %d/%d failed, retrying in %v...", attempt, maxRetries, delay)) // Wait with context cancellation support select { @@ -160,7 +160,7 @@ func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, re } } - cmdio.LogString(ctx, fmt.Sprintf("Launching psql session to %s (attempt %d/%d)...", db.ReadWriteDns, attempt+1, maxRetries+1)) + cmdio.LogString(ctx, fmt.Sprintf("Launching psql session to %s (attempt %d/%d)...", db.ReadWriteDns, attempt+1, maxRetries)) // Try to launch psql and capture the exit status err := attemptConnection(ctx, args, cmdEnv) @@ -184,5 +184,5 @@ func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, re } // All retries exhausted - return fmt.Errorf("failed to connect after %d attempts, last error: %w", maxRetries+1, lastErr) + return fmt.Errorf("failed to connect after %d attempts, last error: %w", maxRetries, lastErr) }