Skip to content

Commit 32f1ce8

Browse files
committed
filesystem: Fix crash when watch callback returns error or throws.
Previously, when a filesystem watch callback returned a non-OK status or threw an exception, the error would propagate to FileEventImpl which uses THROW_IF_NOT_OK. Since there's no exception handler in the libevent loop, this caused std::terminate to be called, crashing Envoy. This change makes the inotify (Linux) and kqueue (macOS) watchers handle callback errors gracefully by: 1. Catching exceptions using TRY_ASSERT_MAIN_THREAD/MULTI_CATCH macros. 2. Logging errors at WARN level instead of propagating them. 3. Always returning OkStatus to the event loop. This improves resilience during config/secret hot reloads where transient errors should not crash the proxy. Risk Level: Medium Testing: Added unit tests for error status and exception handling paths. Signed-off-by: Rohit Agrawal <rohit.agrawal@salesforce.com> Signed-off-by: Rohit Agrawal <rohit.agrawal@databricks.com>
1 parent 0d01005 commit 32f1ce8

File tree

7 files changed

+152
-9
lines changed

7 files changed

+152
-9
lines changed

source/common/filesystem/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,12 +120,12 @@ envoy_cc_library(
120120
"//source/common/common:assert_lib",
121121
"//source/common/common:linked_object",
122122
"//source/common/common:minimal_logger_lib",
123+
"//source/common/common:thread_lib",
123124
"//source/common/common:utility_lib",
124125
"//source/common/network:default_socket_interface_lib",
125126
] + select({
126127
"//bazel:windows_x86_64": [
127128
"//source/common/api:os_sys_calls_lib",
128-
"//source/common/common:thread_lib",
129129
],
130130
"//conditions:default": [],
131131
}),

source/common/filesystem/inotify/watcher_impl.cc

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include "source/common/common/assert.h"
1212
#include "source/common/common/fmt.h"
13+
#include "source/common/common/thread.h"
1314
#include "source/common/common/utility.h"
1415
#include "source/common/filesystem/watcher_impl.h"
1516

@@ -52,6 +53,23 @@ absl::Status WatcherImpl::addWatch(absl::string_view path, uint32_t events, OnCh
5253
return absl::OkStatus();
5354
}
5455

56+
void WatcherImpl::callAndLogOnError(OnChangedCb& cb, uint32_t events, const std::string& file) {
57+
TRY_ASSERT_MAIN_THREAD {
58+
const absl::Status status = cb(events);
59+
if (!status.ok()) {
60+
ENVOY_LOG(warn, "Filesystem watch callback for '{}' returned error: {}", file,
61+
status.message());
62+
}
63+
}
64+
END_TRY
65+
MULTI_CATCH(
66+
const std::exception& e,
67+
{
68+
ENVOY_LOG(warn, "Filesystem watch callback for '{}' threw exception: {}", file, e.what());
69+
},
70+
{ ENVOY_LOG(warn, "Filesystem watch callback for '{}' threw unknown exception", file); });
71+
}
72+
5573
absl::Status WatcherImpl::onInotifyEvent() {
5674
while (true) {
5775
// The buffer needs to be suitably aligned to store the first inotify_event structure.
@@ -90,10 +108,10 @@ absl::Status WatcherImpl::onInotifyEvent() {
90108
if (watch.events_ & events) {
91109
if (watch.file_ == file) {
92110
ENVOY_LOG(debug, "matched callback: file: {}", file);
93-
RETURN_IF_NOT_OK(watch.cb_(events));
111+
callAndLogOnError(watch.cb_, events, file);
94112
} else if (watch.file_.empty()) {
95113
ENVOY_LOG(debug, "matched callback: directory: {}", file);
96-
RETURN_IF_NOT_OK(watch.cb_(events));
114+
callAndLogOnError(watch.cb_, events, file);
97115
}
98116
}
99117
}

source/common/filesystem/inotify/watcher_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class WatcherImpl : public Watcher, Logger::Loggable<Logger::Id::file> {
4040
};
4141

4242
absl::Status onInotifyEvent();
43+
void callAndLogOnError(OnChangedCb& cb, uint32_t events, const std::string& file);
4344

4445
Filesystem::Instance& file_system_;
4546
int inotify_fd_;

source/common/filesystem/kqueue/watcher_impl.cc

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "source/common/common/assert.h"
1010
#include "source/common/common/fmt.h"
11+
#include "source/common/common/thread.h"
1112
#include "source/common/common/utility.h"
1213
#include "source/common/filesystem/watcher_impl.h"
1314

@@ -116,20 +117,28 @@ absl::Status WatcherImpl::onKqueueEvent() {
116117

117118
absl::StatusOr<PathSplitResult> pathname_or_error =
118119
file_system_.splitPathFromFilename(file->file_);
119-
RETURN_IF_NOT_OK_REF(pathname_or_error.status());
120+
if (!pathname_or_error.ok()) {
121+
ENVOY_LOG(warn, "Failed to split path '{}': {}", file->file_,
122+
pathname_or_error.status().message());
123+
continue;
124+
}
120125
PathSplitResult& pathname = pathname_or_error.value();
121126

122127
if (file->watching_dir_) {
123128
if (event.fflags & NOTE_DELETE) {
124-
// directory was deleted
129+
// Directory was deleted.
125130
removeWatch(file);
126131
return absl::OkStatus();
127132
}
128133

129134
if (event.fflags & NOTE_WRITE) {
130-
// directory was written -- check if the file we're actually watching appeared
135+
// Directory was written -- check if the file we're actually watching appeared.
131136
auto file_or_error = addWatch(file->file_, file->events_, file->callback_, true);
132-
RETURN_IF_NOT_OK_REF(file_or_error.status());
137+
if (!file_or_error.ok()) {
138+
ENVOY_LOG(warn, "Failed to re-add watch for '{}': {}", file->file_,
139+
file_or_error.status().message());
140+
continue;
141+
}
133142
FileWatchPtr new_file = file_or_error.value();
134143
if (new_file != nullptr) {
135144
removeWatch(file);
@@ -150,7 +159,11 @@ absl::Status WatcherImpl::onKqueueEvent() {
150159
removeWatch(file);
151160

152161
auto file_or_error = addWatch(file->file_, file->events_, file->callback_, true);
153-
RETURN_IF_NOT_OK_REF(file_or_error.status());
162+
if (!file_or_error.ok()) {
163+
ENVOY_LOG(warn, "Failed to re-add watch for '{}': {}", file->file_,
164+
file_or_error.status().message());
165+
continue;
166+
}
154167
FileWatchPtr new_file = file_or_error.value();
155168
if (new_file == nullptr) {
156169
return absl::OkStatus();
@@ -173,11 +186,29 @@ absl::Status WatcherImpl::onKqueueEvent() {
173186

174187
if (events & file->events_) {
175188
ENVOY_LOG(debug, "matched callback: file: {}", file->file_);
176-
RETURN_IF_NOT_OK(file->callback_(events));
189+
callAndLogOnError(file->callback_, events, file->file_);
177190
}
178191
}
179192
return absl::OkStatus();
180193
}
181194

195+
void WatcherImpl::callAndLogOnError(Watcher::OnChangedCb& cb, uint32_t events,
196+
const std::string& file) {
197+
TRY_ASSERT_MAIN_THREAD {
198+
const absl::Status status = cb(events);
199+
if (!status.ok()) {
200+
ENVOY_LOG(warn, "Filesystem watch callback for '{}' returned error: {}", file,
201+
status.message());
202+
}
203+
}
204+
END_TRY
205+
MULTI_CATCH(
206+
const std::exception& e,
207+
{
208+
ENVOY_LOG(warn, "Filesystem watch callback for '{}' threw exception: {}", file, e.what());
209+
},
210+
{ ENVOY_LOG(warn, "Filesystem watch callback for '{}' threw unknown exception", file); });
211+
}
212+
182213
} // namespace Filesystem
183214
} // namespace Envoy

source/common/filesystem/kqueue/watcher_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class WatcherImpl : public Watcher, Logger::Loggable<Logger::Id::file> {
4646
absl::StatusOr<FileWatchPtr> addWatch(absl::string_view path, uint32_t events,
4747
Watcher::OnChangedCb cb, bool pathMustExist);
4848
void removeWatch(FileWatchPtr& watch);
49+
void callAndLogOnError(OnChangedCb& cb, uint32_t events, const std::string& file);
4950

5051
Filesystem::Instance& file_system_;
5152
int queue_;

test/common/filesystem/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,12 @@ envoy_cc_test(
3434
srcs = ["watcher_impl_test.cc"],
3535
rbe_pool = "6gig",
3636
deps = [
37+
"//envoy/common:exception_lib",
3738
"//source/common/common:assert_lib",
3839
"//source/common/event:dispatcher_includes",
3940
"//source/common/event:dispatcher_lib",
4041
"//source/common/filesystem:watcher_lib",
4142
"//test/test_common:environment_lib",
43+
"//test/test_common:logging_lib",
4244
],
4345
)

test/common/filesystem/watcher_impl_test.cc

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
#include <cstdint>
22
#include <fstream>
33

4+
#include "envoy/common/exception.h"
5+
46
#include "source/common/common/assert.h"
57
#include "source/common/event/dispatcher_impl.h"
68
#include "source/common/filesystem/watcher_impl.h"
79

810
#include "test/test_common/environment.h"
11+
#include "test/test_common/logging.h"
912
#include "test/test_common/utility.h"
1013

1114
#include "gmock/gmock.h"
@@ -214,5 +217,92 @@ TEST_F(WatcherImplTest, SymlinkAtomicRename) {
214217
}
215218
#endif
216219

220+
// Test that callback returning error status is logged and doesn't crash.
221+
TEST_F(WatcherImplTest, CallbackReturnsErrorStatus) {
222+
Filesystem::WatcherPtr watcher = dispatcher_->createFilesystemWatcher();
223+
224+
TestEnvironment::createPath(TestEnvironment::temporaryPath("envoy_test"));
225+
std::ofstream file(TestEnvironment::temporaryPath("envoy_test/watcher_target"));
226+
227+
WatchCallback callback;
228+
EXPECT_CALL(callback, called(Watcher::Events::Modified));
229+
ASSERT_TRUE(watcher
230+
->addWatch(TestEnvironment::temporaryPath("envoy_test/watcher_target"),
231+
Watcher::Events::Modified,
232+
[&](uint32_t events) {
233+
callback.called(events);
234+
dispatcher_->exit();
235+
// Return an error status - should be logged but not crash.
236+
return absl::InternalError("simulated callback error");
237+
})
238+
.ok());
239+
dispatcher_->run(Event::Dispatcher::RunType::NonBlock);
240+
241+
EXPECT_LOG_CONTAINS("warn", "Filesystem watch callback for", file << "text" << std::flush;
242+
file.close(); dispatcher_->run(Event::Dispatcher::RunType::Block););
243+
}
244+
245+
// Test that callback throwing exception is caught and logged.
246+
TEST_F(WatcherImplTest, CallbackThrowsException) {
247+
Filesystem::WatcherPtr watcher = dispatcher_->createFilesystemWatcher();
248+
249+
TestEnvironment::createPath(TestEnvironment::temporaryPath("envoy_test"));
250+
std::ofstream file(TestEnvironment::temporaryPath("envoy_test/watcher_target"));
251+
252+
WatchCallback callback;
253+
EXPECT_CALL(callback, called(Watcher::Events::Modified));
254+
ASSERT_TRUE(watcher
255+
->addWatch(TestEnvironment::temporaryPath("envoy_test/watcher_target"),
256+
Watcher::Events::Modified,
257+
[&](uint32_t events) -> absl::Status {
258+
callback.called(events);
259+
dispatcher_->exit();
260+
// Throw an exception - should be caught and logged.
261+
throw EnvoyException("simulated callback exception");
262+
})
263+
.ok());
264+
dispatcher_->run(Event::Dispatcher::RunType::NonBlock);
265+
266+
EXPECT_LOG_CONTAINS("warn", "threw exception", file << "text" << std::flush; file.close();
267+
dispatcher_->run(Event::Dispatcher::RunType::Block););
268+
}
269+
270+
// Test that multiple callbacks can fail without affecting each other.
271+
TEST_F(WatcherImplTest, MultipleCallbacksWithErrors) {
272+
Filesystem::WatcherPtr watcher = dispatcher_->createFilesystemWatcher();
273+
274+
TestEnvironment::createPath(TestEnvironment::temporaryPath("envoy_test"));
275+
std::ofstream file(TestEnvironment::temporaryPath("envoy_test/watcher_target"));
276+
277+
int callback_count = 0;
278+
ASSERT_TRUE(watcher
279+
->addWatch(TestEnvironment::temporaryPath("envoy_test/watcher_target"),
280+
Watcher::Events::Modified,
281+
[&](uint32_t) {
282+
callback_count++;
283+
if (callback_count >= 2) {
284+
dispatcher_->exit();
285+
}
286+
// First callback returns error, second returns OK.
287+
if (callback_count == 1) {
288+
return absl::InternalError("first callback error");
289+
}
290+
return absl::OkStatus();
291+
})
292+
.ok());
293+
dispatcher_->run(Event::Dispatcher::RunType::NonBlock);
294+
295+
// Trigger first modification. The first callback returns error, but watcher continues.
296+
file << "text1" << std::flush;
297+
dispatcher_->run(Event::Dispatcher::RunType::NonBlock);
298+
299+
// Trigger second modification. It should still work.
300+
file << "text2" << std::flush;
301+
file.close();
302+
dispatcher_->run(Event::Dispatcher::RunType::Block);
303+
304+
EXPECT_EQ(2, callback_count);
305+
}
306+
217307
} // namespace Filesystem
218308
} // namespace Envoy

0 commit comments

Comments
 (0)