From c30bd0d46b8509d3f5cc5be8fef7fa6caa2969a6 Mon Sep 17 00:00:00 2001 From: Al Johri Date: Thu, 14 May 2026 14:51:36 -0400 Subject: [PATCH] fix(storage-opendal): add TimeoutLayer inside RetryLayer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current `create_operator` applies `RetryLayer` with no `TimeoutLayer`. Per opendal's docs, `TimeoutLayer` must be inside `RetryLayer`; without it, a future parked indefinitely on a silently dropped TCP connection never produces an `Err` and `RetryLayer` cannot retry — the caller hangs forever. We hit this in production: an iceberg `try_next()` Range-GET parked for 24h until a Kubernetes Job's `activeDeadlineSeconds` killed the pod. Core-dump analysis confirmed two in-flight Range-GETs sitting in heap with no live TCP connection, no error propagated, no retries attempted. This adds `TimeoutLayer::new()` with opendal's defaults (60s for non-IO ops; 10s per IO chunk) inside `RetryLayer`. Each retry attempt is now independently bounded; a hung connection times out, RetryLayer sees the error, and either succeeds on a subsequent attempt or propagates a clean error to the caller. Refs: #1288 (auto-closed by stale bot without engagement), #788 (the PR that added the unbounded RetryLayer). --- crates/storage/opendal/src/lib.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/crates/storage/opendal/src/lib.rs b/crates/storage/opendal/src/lib.rs index 65deaa5f44..ac07884ce4 100644 --- a/crates/storage/opendal/src/lib.rs +++ b/crates/storage/opendal/src/lib.rs @@ -39,7 +39,7 @@ use iceberg::io::{ }; use iceberg::{Error, ErrorKind, Result}; use opendal::Operator; -use opendal::layers::RetryLayer; +use opendal::layers::{RetryLayer, TimeoutLayer}; use serde::{Deserialize, Serialize}; use utils::from_opendal_error; @@ -326,9 +326,17 @@ impl OpenDalStorage { } }; - // Transient errors are common for object stores; however there's no - // harm in retrying temporary failures for other storage backends as well. - let operator = operator.layer(RetryLayer::new()); + // Apply observability/resilience layers. TimeoutLayer must be + // inside RetryLayer so each retry attempt is independently + // bounded — without a per-attempt timeout, a future parked on a + // silently dropped TCP connection never produces an `Err` and + // RetryLayer cannot retry, leaving the caller hung indefinitely. + // See: https://opendal.apache.org/docs/rust/opendal/layers/struct.TimeoutLayer.html + // + // Transient errors are common for object stores; we retry temporary + // failures with exponential backoff. The retry behavior also + // benefits non-object-store backends. + let operator = operator.layer(TimeoutLayer::new()).layer(RetryLayer::new()); Ok((operator, relative_path)) }