Skip to content

sled agent hung #9569

@rcgoodfellow

Description

@rcgoodfellow

I've had sled agent hang in several a4x2 runs recently. When this happens sled-agent hangs on curl requests to any URL, it does not even return a 404 for URLs that do not exist it just hangs. Logging stops completely.

This happens during rack initialization. Typically 3 of 4 nodes will come all the way up with the sled agent on one node locking up.

Here are some various things we looked at while debugging.

Ptree

 root@g1:~# ptree 1308
1305   ctrun -l child -o noorphan,regent /opt/oxide/sled-agent/sled-agent run /opt/oxi
  1308   /opt/oxide/sled-agent/sled-agent run /opt/oxide/sled-agent/pkg/config.toml
    4665   <defunct>
    4667   <defunct>
    4663   <defunct>
    4668   <defunct>
    4662   <defunct>
    4666   <defunct>
    4664   <defunct>
    4688   <defunct>
    1383   /opt/oxide/sled-agent/tofino-monitor

Zones

root@g1:~# zoneadm list -ivc
  ID NAME             STATUS     PATH                           BRAND    IP
   0 global           running    /                              ipkg     shared
   1 oxz_internal_dns_0b1ea169-bb1d-4094-aab6-25c69317d8c0 running    /pool/ext/ed370813-4c5e-4f3f-8ae7-4b39993c6805/crypt/zone/oxz_internal_dns_0b1ea169-bb1d-4094-aab6-25c69317d8c0 omicron1 excl
   2 oxz_ntp_939c6cde-c207-416b-8c69-188213debc05 running    /pool/ext/2d5c6f59-6c27-4b62-97c3-b4fdab893f0e/crypt/zone/oxz_ntp_939c6cde-c207-416b-8c69-188213debc05 omicron1 excl
   3 oxz_cockroachdb_db60d835-8068-43a3-875c-98cc24c28c4a running    /pool/ext/ed370813-4c5e-4f3f-8ae7-4b39993c6805/crypt/zone/oxz_cockroachdb_db60d835-8068-43a3-875c-98cc24c28c4a omicron1 excl
   - oxz_crucible_pantry_1600fba1-ecc0-48ff-9f45-5b2be93da34d installed  /pool/ext/8ec0a093-a401-4e0d-8877-14d39b2217de/crypt/zone/oxz_crucible_pantry_1600fba1-ecc0-48ff-9f45-5b2be93da34d omicron1 excl
   - oxz_crucible_0a84f712-beb1-43c3-9e9b-2f2375fef895 installed  /pool/ext/2d5c6f59-6c27-4b62-97c3-b4fdab893f0e/crypt/zone/oxz_crucible_0a84f712-beb1-43c3-9e9b-2f2375fef895 omicron1 excl
   - oxz_crucible_6edae047-c9e2-4ca6-992f-64529f2bb072 installed  /pool/ext/ed370813-4c5e-4f3f-8ae7-4b39993c6805/crypt/zone/oxz_crucible_6edae047-c9e2-4ca6-992f-64529f2bb072 omicron1 excl
   - oxz_crucible_60579e90-efa7-4e29-bc0c-02a5c1faa5ae installed  /pool/ext/8ec0a093-a401-4e0d-8877-14d39b2217de/crypt/zone/oxz_crucible_60579e90-efa7-4e29-bc0c-02a5c1faa5ae omicron1 excl
   - oxz_oximeter_20fb02e7-7f08-4e72-8270-0e217d789b35 installed  /pool/ext/8ec0a093-a401-4e0d-8877-14d39b2217de/crypt/zone/oxz_oximeter_20fb02e7-7f08-4e72-8270-0e217d789b35 omicron1 excl
   - oxz_crucible_b61c1a8c-3edf-4207-933d-8a543a6347b8 installed  /pool/ext/6c967709-685c-4dfe-9f26-9b3b5d6222c5/crypt/zone/oxz_crucible_b61c1a8c-3edf-4207-933d-8a543a6347b8 omicron1 excl
   - oxz_crucible_6fe2c6b5-13b4-4823-bbef-a9820d87365d installed  /pool/ext/5d9cbbb6-ba0f-4451-a614-92ed9e7e8528/crypt/zone/oxz_crucible_6fe2c6b5-13b4-4823-bbef-a9820d87365d omicron1 excl
   - oxz_external_dns_ff90c87f-8726-45bd-af30-3d8cc4e066fa installed  /pool/ext/ed370813-4c5e-4f3f-8ae7-4b39993c6805/crypt/zone/oxz_external_dns_ff90c87f-8726-45bd-af30-3d8cc4e066fa omicron1 excl

Pstack

Notably thread #9 did not stay on the lock we see below. There was movement happening.

root@g1:~# pstack 1308 | demangle
1308:   /opt/oxide/sled-agent/sled-agent run /opt/oxide/sled-agent/pkg/config.
--------------------- thread# 1 / lwp# 1 ---------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
 fffffc7fef0d0695 cond_wait_queue (5d4abe0, 5d4abf0, 0) + 55
 fffffc7fef0d0d1a __cond_wait (5d4abe0, 5d4abf0) + ba
 fffffc7fef0d0d5e cond_wait (5d4abe0, 5d4abf0) + 2e
 fffffc7fef0d0da5 pthread_cond_wait (5d4abe0, 5d4abf0) + 15
 000000000518236b parking_lot::condvar::Condvar::wait_until_internal::he1dde2516267dee7 () + 2ab
 0000000005141ce0 tokio::runtime::park::Inner::park::h90ed55b1dd6b2056 (.llvm.946546321561055788) () + 60
 0000000001f017ba tokio::runtime::park::CachedParkThread::block_on::h7c522107511bc19d () + 8fa
 0000000002006cea tokio::runtime::context::runtime::enter_runtime::h13e6d78c542a74bc () + 14a
 0000000001f01a0d tokio::runtime::runtime::Runtime::block_on::h8e22f9c0053abe06 () + 4d
 000000000209d883 oxide_tokio_rt::run::h4e0c43b3796fa8cf () + 93
 000000000205aa12 sled_agent::main::h91ff400535bac537 () + 22
 0000000001e21f06 std::sys::backtrace::__rust_begin_short_backtrace::hb23acd1dd06db521 () + 6
 0000000001e21efc std::rt::lang_start::_$u7b$$u7b$closure$u7d$$u7d$::h25480481a30c1af9 (.llvm.13015844564475507567) () + c
 00000000053802d9 std::rt::lang_start_internal::h58f1a8b72a792a85 () + 459
 00000000020863ec main () + 2c
 0000000001b33c37 _start_crt () + 87
 0000000001b33b98 _start () + 18
--------- thread# 2 / lwp# 2 [tokio-runtime-worker] ----------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
 fffffc7fef0d0695 cond_wait_queue (5d4ab70, 5d4ab80, 0) + 55
 fffffc7fef0d0d1a __cond_wait (5d4ab70, 5d4ab80) + ba
 fffffc7fef0d0d5e cond_wait (5d4ab70, 5d4ab80) + 2e
 fffffc7fef0d0da5 pthread_cond_wait (5d4ab70, 5d4ab80) + 15
 000000000518236b parking_lot::condvar::Condvar::wait_until_internal::he1dde2516267dee7 () + 2ab
 0000000005142cc0 tokio::runtime::scheduler::multi_thread::park::Parker::park::h29d6f33cf7b11a5c () + 100
 000000000512e809 tokio::runtime::scheduler::multi_thread::worker::Context::park_timeout::h8a4a3bc289da7926 () + 89
 000000000512ffef tokio::runtime::scheduler::multi_thread::worker::Context::run::hbd9907b8a97dea57 () + 14bf
 00000000051532fa tokio::runtime::context::scoped::Scoped<T>::set::hf62e041f0105dd77 () + 2a
 0000000005142a64 tokio::runtime::context::runtime::enter_runtime::hfb5ad9c853d50470 () + b4
 000000000512d270 tokio::runtime::scheduler::multi_thread::worker::run::h4c23cd65bb474ece () + a0
 000000000516cd10 tokio::runtime::task::core::Core<T,S>::poll::h0a43f808cab5d70d () + 70
 000000000512709a tokio::runtime::task::harness::Harness<T,S>::poll::h175776a768aa7588 () + 8a
 000000000516a044 tokio::runtime::blocking::pool::Inner::run::h1f1ef31457761b6f () + f4
 000000000514caee std::sys::backtrace::__rust_begin_short_backtrace::hc94f0c1cf2b619f4 () + 3e
 000000000513da28 core::ops::function::FnOnce::call_once{{vtable.shim}}::hb1bdf8e88451f6c6 () + a8
 000000000538b0ef std::sys::thread::unix::Thread::new::thread_start::ha097fb815024401f () + 2f
 fffffc7fef0d6f97 _thrp_setup (fffffc7fee540240) + 77
 fffffc7fef0d72e0 _lwp_start ()
--------- thread# 3 / lwp# 3 [tokio-runtime-worker] ----------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
--------- thread# 4 / lwp# 4 [tokio-runtime-worker] ----------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
--------- thread# 5 / lwp# 5 [tokio-runtime-worker] ----------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
--------- thread# 6 / lwp# 6 [tokio-runtime-worker] ----------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
 fffffc7fef0d0695 cond_wait_queue (5d4a9b0, 5d4a9c0, 0) + 55
 fffffc7fef0d0d1a __cond_wait (5d4a9b0, 5d4a9c0) + ba
 fffffc7fef0d0d5e cond_wait (5d4a9b0, 5d4a9c0) + 2e
 fffffc7fef0d0da5 pthread_cond_wait (5d4a9b0, 5d4a9c0) + 15
 000000000518236b parking_lot::condvar::Condvar::wait_until_internal::he1dde2516267dee7 () + 2ab
 0000000005142cc0 tokio::runtime::scheduler::multi_thread::park::Parker::park::h29d6f33cf7b11a5c () + 100
 000000000512e809 tokio::runtime::scheduler::multi_thread::worker::Context::park_timeout::h8a4a3bc289da7926 () + 89
 000000000512ffef tokio::runtime::scheduler::multi_thread::worker::Context::run::hbd9907b8a97dea57 () + 14bf
 00000000051532fa tokio::runtime::context::scoped::Scoped<T>::set::hf62e041f0105dd77 () + 2a
 0000000005142a64 tokio::runtime::context::runtime::enter_runtime::hfb5ad9c853d50470 () + b4
 000000000512d270 tokio::runtime::scheduler::multi_thread::worker::run::h4c23cd65bb474ece () + a0
 000000000516cd10 tokio::runtime::task::core::Core<T,S>::poll::h0a43f808cab5d70d () + 70
 000000000512709a tokio::runtime::task::harness::Harness<T,S>::poll::h175776a768aa7588 () + 8a
 000000000516a044 tokio::runtime::blocking::pool::Inner::run::h1f1ef31457761b6f () + f4
 000000000514caee std::sys::backtrace::__rust_begin_short_backtrace::hc94f0c1cf2b619f4 () + 3e
 000000000513da28 core::ops::function::FnOnce::call_once{{vtable.shim}}::hb1bdf8e88451f6c6 () + a8
 000000000538b0ef std::sys::thread::unix::Thread::new::thread_start::ha097fb815024401f () + 2f
 fffffc7fef0d6f97 _thrp_setup (fffffc7fee542240) + 77
 fffffc7fef0d72e0 _lwp_start ()
--------- thread# 7 / lwp# 7 [tokio-runtime-worker] ----------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
--------- thread# 8 / lwp# 8 [tokio-runtime-worker] ----------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
 fffffc7fef0d0695 cond_wait_queue (5d4a860, 5d4a870, 0) + 55
 fffffc7fef0d0d1a __cond_wait (5d4a860, 5d4a870) + ba
 fffffc7fef0d0d5e cond_wait (5d4a860, 5d4a870) + 2e
 fffffc7fef0d0da5 pthread_cond_wait (5d4a860, 5d4a870) + 15
 000000000518236b parking_lot::condvar::Condvar::wait_until_internal::he1dde2516267dee7 () + 2ab
 0000000005142cc0 tokio::runtime::scheduler::multi_thread::park::Parker::park::h29d6f33cf7b11a5c () + 100
 000000000512e809 tokio::runtime::scheduler::multi_thread::worker::Context::park_timeout::h8a4a3bc289da7926 () + 89
 000000000512ffef tokio::runtime::scheduler::multi_thread::worker::Context::run::hbd9907b8a97dea57 () + 14bf
 00000000051532fa tokio::runtime::context::scoped::Scoped<T>::set::hf62e041f0105dd77 () + 2a
 0000000005142a64 tokio::runtime::context::runtime::enter_runtime::hfb5ad9c853d50470 () + b4
 000000000512d270 tokio::runtime::scheduler::multi_thread::worker::run::h4c23cd65bb474ece () + a0
 000000000516cd10 tokio::runtime::task::core::Core<T,S>::poll::h0a43f808cab5d70d () + 70
 000000000512709a tokio::runtime::task::harness::Harness<T,S>::poll::h175776a768aa7588 () + 8a
 000000000516a044 tokio::runtime::blocking::pool::Inner::run::h1f1ef31457761b6f () + f4
 000000000514caee std::sys::backtrace::__rust_begin_short_backtrace::hc94f0c1cf2b619f4 () + 3e
 000000000513da28 core::ops::function::FnOnce::call_once{{vtable.shim}}::hb1bdf8e88451f6c6 () + a8
 000000000538b0ef std::sys::thread::unix::Thread::new::thread_start::ha097fb815024401f () + 2f
 fffffc7fef0d6f97 _thrp_setup (fffffc7fee543240) + 77
 fffffc7fef0d72e0 _lwp_start ()
--------- thread# 9 / lwp# 9 [tokio-runtime-worker] ----------
 fffffc7fef0cf060 mutex_lock ()
 0000000001f6ab3c <tokio::io::split::ReadHalf<T> as tokio::io::async_read::AsyncRead>::poll_read::hcebf60213cb1dd33 () + 2c
 0000000001e4883d <core::future::poll_fn::PollFn<F> as core::future::future::Future>::poll::h8020c44d0240c47a () + 4fd
 0000000001b9a726 trust_quorum::established_conn::EstablishedConn::run::{{closure}}::hc36c36349794365f () + 436
 0000000001ba17ea trust_quorum::connection_manager::ConnMgr::accept::_$u7b$$u7b$closure$u7d$$u7d$::_$u7b$$u7b$closure$u7d$$u7d$::h3cb3600175d95ab3 (.llvm.15265051623724031513) () + 71a
 0000000001c2235b tokio::runtime::task::core::Core<T,S>::poll::hb2fe6e480072c257 () + 3b
 0000000001b673fd tokio::runtime::task::harness::Harness<T,S>::poll::hc97b29be0217cddc () + 5d
 0000000005130a53 tokio::runtime::scheduler::multi_thread::worker::Context::run_task::hed800a7a940a1ca4 () + 243
 000000000512fade tokio::runtime::scheduler::multi_thread::worker::Context::run::hbd9907b8a97dea57 () + fae
 00000000051532fa tokio::runtime::context::scoped::Scoped<T>::set::hf62e041f0105dd77 () + 2a
 0000000005142a64 tokio::runtime::context::runtime::enter_runtime::hfb5ad9c853d50470 () + b4
 000000000512d270 tokio::runtime::scheduler::multi_thread::worker::run::h4c23cd65bb474ece () + a0
 000000000516cd10 tokio::runtime::task::core::Core<T,S>::poll::h0a43f808cab5d70d () + 70
 000000000512709a tokio::runtime::task::harness::Harness<T,S>::poll::h175776a768aa7588 () + 8a
 000000000516a044 tokio::runtime::blocking::pool::Inner::run::h1f1ef31457761b6f () + f4
 000000000514caee std::sys::backtrace::__rust_begin_short_backtrace::hc94f0c1cf2b619f4 () + 3e
 000000000513da28 core::ops::function::FnOnce::call_once{{vtable.shim}}::hb1bdf8e88451f6c6 () + a8
 000000000538b0ef std::sys::thread::unix::Thread::new::thread_start::ha097fb815024401f () + 2f
 fffffc7fef0d6f97 _thrp_setup (fffffc7fee543a40) + 77
 fffffc7fef0d72e0 _lwp_start ()
-------------------- thread# 10 / lwp# 10 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
 fffffc7fef0d0695 cond_wait_queue (5d4a578, 5d4a590, 0) + 55
 fffffc7fef0d0d1a __cond_wait (5d4a578, 5d4a590) + ba
 fffffc7fef0d0d5e cond_wait (5d4a578, 5d4a590) + 2e
 fffffc7fef0d0da5 pthread_cond_wait (5d4a578, 5d4a590) + 15
 000000000539341b std::thread::Thread::park::hb57adee7634cc03d () + 5b
 0000000005392eda std::thread::park::hde6e06d33c5c5f3c () + 4a
 0000000004b0770e crossbeam_channel::flavors::array::Channel<T>::recv::{{closure}}::hab8ef1c8a69be46d () + 19e
 0000000004b07061 crossbeam_channel::flavors::array::Channel<T>::recv::h72829eb28fa3f39a () + 2a1
 0000000004b2597d crossbeam_channel::channel::Receiver<T>::recv::h3b80d8dbf1b334a6 () + 3d
 0000000004aefecb std::sys::backtrace::__rust_begin_short_backtrace::h4ed6034d94972c62 () + 6b
 00000000023ed37c core::ops::function::FnOnce::call_once{{vtable.shim}}::hf60a80fb5bbb05e0 () + 11c
 000000000538b0ef std::sys::thread::unix::Thread::new::thread_start::ha097fb815024401f () + 2f
 fffffc7fef0d6f97 _thrp_setup (fffffc7fee544240) + 77
 fffffc7fef0d72e0 _lwp_start ()
-------------------- thread# 11 / lwp# 11 --------------------
 fffffc7fef0ddbca ioctl    (a, 63746502, 5d49c00)
------------- thread# 13 / lwp# 13 [umem_update] -------------
 fffffc7fef0d7327 lwp_park (0, fffffc7fec9fee30, 0)
 fffffc7fef0d0695 cond_wait_queue (fffffc7fef23d710, fffffc7fef23d6f0, fffffc7fec9fee30) + 55
 fffffc7fef0d0ad5 cond_wait_common (fffffc7fef23d710, fffffc7fef23d6f0, fffffc7fec9fee30) + 1c5
 fffffc7fef0d0e67 __cond_timedwait (fffffc7fef23d710, fffffc7fef23d6f0, 3, fffffc7fec9fef50) + a7
 fffffc7fef0d0f0a cond_clockwait (fffffc7fef23d710, fffffc7fef23d6f0, 3, fffffc7fec9fef50) + 4a
 fffffc7fef0d0f83 cond_timedwait (fffffc7fef23d710, fffffc7fef23d6f0, fffffc7fec9fef50) + 23
 fffffc7fef1f9c06 umem_update_thread (0) + 1d6
 fffffc7fef0d6f97 _thrp_setup (fffffc7fee545a40) + 77
 fffffc7fef0d72e0 _lwp_start ()
-------------------- thread# 14 / lwp# 14 --------------------
 fffffc7fef0de6aa waitid   (0, 567, fffffc7fea3ffaf0, 3)
 fffffc7fef075c7c waitpid (567, fffffc7fea3ffc3c, 0) + 3c
 000000000539453d std::process::Child::wait::he362f24a0c744b79 () + 5d
 0000000003030e73 sled_hardware::illumos::monitor_tofino::h02e1fad4aed43e07 () + 403
 0000000003040e54 std::sys::backtrace::__rust_begin_short_backtrace::h2388726b91a2bd97 () + 24
 0000000003049488 core::ops::function::FnOnce::call_once{{vtable.shim}}::h237e0bfd325f00b2 () + b8
 000000000538b0ef std::sys::thread::unix::Thread::new::thread_start::ha097fb815024401f () + 2f
 fffffc7fef0d6f97 _thrp_setup (fffffc7fee546240) + 77
 fffffc7fef0d72e0 _lwp_start ()
-------------------- thread# 16 / lwp# 16 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 17 / lwp# 17 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 18 / lwp# 18 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 19 / lwp# 19 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 20 / lwp# 20 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 21 / lwp# 21 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 22 / lwp# 22 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 23 / lwp# 23 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 25 / lwp# 25 --------------------
 fffffc7fef0d7327 lwp_park (0, 0, 0)
-------------------- thread# 33 / lwp# 33 --------------------
 000000000538b0c0 std::sys::thread::unix::Thread::new::thread_start::ha097fb815024401f(), exit value = 0x0000000000000000
        ** zombie (exited, not detached, not yet joined) **

Dtrace

The following dtrace scripts showed no signs of life

Kernel threads

root@g1:~# mdb -k -e '0t1308::pid2proc | ::walk thread | ::stacks'
THREAD           STATE    SOBJ                COUNT
fffffe0bd9c167c0 SLEEP    CV                     18
                 swtch+0x139
                 cv_wait_sig_swap_core+0x160
                 cv_wait_sig_swap+0x17
                 cv_waituntil_sig+0xcd
                 lwp_park+0x13f
                 syslwp_park+0x85

fffffe0bdb0c17a0 SLEEP    CV                      1
                 swtch+0x139
                 cv_timedwait_sig_hires+0x2cd
                 cv_waituntil_sig+0x105
                 lwp_park+0x13f
                 syslwp_park+0x85

fffffe0bdda36b60 SLEEP    CV                      1
                 swtch+0x139
                 cv_wait_sig+0x154
                 cte_get_event+0x93
                 ctfs_endpoint_ioctl+0xe1
                 ctfs_bu_ioctl+0x4e
                 fop_ioctl+0x40
                 ioctl+0x144

fffffe0bdc4fb860 SLEEP    CV                      1
                 swtch+0x139
                 cv_wait_sig_swap_core+0x160
                 cv_wait_sig_swap+0x17
                 waitid+0x29e
                 waitsys+0x35

fffffe0bdda35b80 ONPROC   <NONE>                  1
                 cpu_decay+0x2a
                 cpu_grow+0x24
                 cpu_update_pct+0xa5
                 new_mstate+0x25a
                 trap+0x79a
                 sys_rtt_common+0x18f

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions