diff --git a/crates/storage/opendal/src/lib.rs b/crates/storage/opendal/src/lib.rs index 65deaa5f44..ac07884ce4 100644 --- a/crates/storage/opendal/src/lib.rs +++ b/crates/storage/opendal/src/lib.rs @@ -39,7 +39,7 @@ use iceberg::io::{ }; use iceberg::{Error, ErrorKind, Result}; use opendal::Operator; -use opendal::layers::RetryLayer; +use opendal::layers::{RetryLayer, TimeoutLayer}; use serde::{Deserialize, Serialize}; use utils::from_opendal_error; @@ -326,9 +326,17 @@ impl OpenDalStorage { } }; - // Transient errors are common for object stores; however there's no - // harm in retrying temporary failures for other storage backends as well. - let operator = operator.layer(RetryLayer::new()); + // Apply observability/resilience layers. TimeoutLayer must be + // inside RetryLayer so each retry attempt is independently + // bounded — without a per-attempt timeout, a future parked on a + // silently dropped TCP connection never produces an `Err` and + // RetryLayer cannot retry, leaving the caller hung indefinitely. + // See: https://opendal.apache.org/docs/rust/opendal/layers/struct.TimeoutLayer.html + // + // Transient errors are common for object stores; we retry temporary + // failures with exponential backoff. The retry behavior also + // benefits non-object-store backends. + let operator = operator.layer(TimeoutLayer::new()).layer(RetryLayer::new()); Ok((operator, relative_path)) }