diff --git a/drivers/overlay/ov_network.go b/drivers/overlay/ov_network.go index 994de93ae6..12bf872c8d 100644 --- a/drivers/overlay/ov_network.go +++ b/drivers/overlay/ov_network.go @@ -696,6 +696,12 @@ func (n *network) initSandbox(restore bool) error { var nlSock *nl.NetlinkSocket sbox.InvokeFunc(func() { nlSock, err = nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_NEIGH) + if err != nil { + return + } + // set the receive timeout to not remain stuck on the RecvFrom if the fd gets closed + tv := syscall.NsecToTimeval(soTimeout.Nanoseconds()) + err = nlSock.SetReceiveTimeout(&tv) }) n.setNetlinkSocket(nlSock) @@ -721,6 +727,11 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { // The netlink socket got closed, simply exit to not leak this goroutine return } + // When the receive timeout expires the receive will return EAGAIN + if err == syscall.EAGAIN { + // we continue here to avoid spam for timeouts + continue + } logrus.Errorf("Failed to receive from netlink: %v ", err) continue } diff --git a/drivers/overlay/overlay_test.go b/drivers/overlay/overlay_test.go index 6d2127311d..75c89da6bb 100644 --- a/drivers/overlay/overlay_test.go +++ b/drivers/overlay/overlay_test.go @@ -1,7 +1,9 @@ package overlay import ( + "context" "net" + "syscall" "testing" "time" @@ -12,6 +14,7 @@ import ( "github.com/docker/libnetwork/driverapi" "github.com/docker/libnetwork/netlabel" _ "github.com/docker/libnetwork/testutils" + "github.com/vishvananda/netlink/nl" ) func init() { @@ -135,3 +138,36 @@ func TestOverlayType(t *testing.T) { dt.d.Type()) } } + +// Test that the netlink socket close unblock the watchMiss to avoid deadlock +func TestNetlinkSocket(t *testing.T) { + // This is the same code used by the overlay driver to create the netlink interface + // for the watch miss + nlSock, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_NEIGH) + if err != nil { + t.Fatal() + } + // set the receive timeout to not remain stuck on the RecvFrom if the fd gets closed + tv := syscall.NsecToTimeval(soTimeout.Nanoseconds()) + err = nlSock.SetReceiveTimeout(&tv) + if err != nil { + t.Fatal() + } + n := &network{id: "testnetid"} + ch := make(chan error) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + go func() { + n.watchMiss(nlSock) + ch <- nil + }() + time.Sleep(5 * time.Second) + nlSock.Close() + select { + case <-ch: + case <-ctx.Done(): + { + t.Fatalf("Timeout expired") + } + } +} diff --git a/ipvs/ipvs.go b/ipvs/ipvs.go index 266cc24dbe..e5f991f15c 100644 --- a/ipvs/ipvs.go +++ b/ipvs/ipvs.go @@ -5,11 +5,17 @@ package ipvs import ( "net" "syscall" + "time" "github.com/vishvananda/netlink/nl" "github.com/vishvananda/netns" ) +const ( + netlinkRecvSocketsTimeout = 3 * time.Second + netlinkSendSocketTimeout = 30 * time.Second +) + // Service defines an IPVS service in its entirety. type Service struct { // Virtual service address. @@ -66,6 +72,15 @@ func New(path string) (*Handle, error) { if err != nil { return nil, err } + // Add operation timeout to avoid deadlocks + tv := syscall.NsecToTimeval(netlinkSendSocketTimeout.Nanoseconds()) + if err := sock.SetSendTimeout(&tv); err != nil { + return nil, err + } + tv = syscall.NsecToTimeval(netlinkRecvSocketsTimeout.Nanoseconds()) + if err := sock.SetReceiveTimeout(&tv); err != nil { + return nil, err + } return &Handle{sock: sock}, nil } diff --git a/ipvs/netlink.go b/ipvs/netlink.go index 635606dacd..499ed0c1ef 100644 --- a/ipvs/netlink.go +++ b/ipvs/netlink.go @@ -186,10 +186,6 @@ func newGenlRequest(familyID int, cmd uint8) *nl.NetlinkRequest { } func execute(s *nl.NetlinkSocket, req *nl.NetlinkRequest, resType uint16) ([][]byte, error) { - var ( - err error - ) - if err := s.Send(req); err != nil { return nil, err } @@ -205,6 +201,13 @@ done: for { msgs, err := s.Receive() if err != nil { + if s.GetFd() == -1 { + return nil, fmt.Errorf("Socket got closed on receive") + } + if err == syscall.EAGAIN { + // timeout fired + continue + } return nil, err } for _, m := range msgs { diff --git a/vendor.conf b/vendor.conf index 203e1cebf5..17a45b3a86 100644 --- a/vendor.conf +++ b/vendor.conf @@ -37,7 +37,7 @@ github.com/seccomp/libseccomp-golang 1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1 github.com/stretchr/testify dab07ac62d4905d3e48d17dc549c684ac3b7c15a github.com/syndtr/gocapability 2c00daeb6c3b45114c80ac44119e7b8801fdd852 github.com/ugorji/go f1f1a805ed361a0e078bb537e4ea78cd37dcf065 -github.com/vishvananda/netlink bd6d5de5ccef2d66b0a26177928d0d8895d7f969 +github.com/vishvananda/netlink aebee77a166882d633b1abc167d59f251b24f11d https://github.com/fcrisciani/netlink github.com/vishvananda/netns 604eaf189ee867d8c147fafc28def2394e878d25 golang.org/x/net c427ad74c6d7a814201695e9ffde0c5d400a7674 golang.org/x/sys 8f0908ab3b2457e2e15403d3697c9ef5cb4b57a9 diff --git a/vendor/github.com/vishvananda/netlink/handle_linux.go b/vendor/github.com/vishvananda/netlink/handle_linux.go index a04ceae6b6..b217c220dc 100644 --- a/vendor/github.com/vishvananda/netlink/handle_linux.go +++ b/vendor/github.com/vishvananda/netlink/handle_linux.go @@ -45,13 +45,10 @@ func (h *Handle) SetSocketTimeout(to time.Duration) error { } tv := syscall.NsecToTimeval(to.Nanoseconds()) for _, sh := range h.sockets { - fd := sh.Socket.GetFd() - err := syscall.SetsockoptTimeval(fd, syscall.SOL_SOCKET, syscall.SO_RCVTIMEO, &tv) - if err != nil { + if err := sh.Socket.SetSendTimeout(&tv); err != nil { return err } - err = syscall.SetsockoptTimeval(fd, syscall.SOL_SOCKET, syscall.SO_SNDTIMEO, &tv) - if err != nil { + if err := sh.Socket.SetReceiveTimeout(&tv); err != nil { return err } } diff --git a/vendor/github.com/vishvananda/netlink/nl/nl_linux.go b/vendor/github.com/vishvananda/netlink/nl/nl_linux.go index 1329acd864..72f7f6af3c 100644 --- a/vendor/github.com/vishvananda/netlink/nl/nl_linux.go +++ b/vendor/github.com/vishvananda/netlink/nl/nl_linux.go @@ -621,6 +621,20 @@ func (s *NetlinkSocket) Receive() ([]syscall.NetlinkMessage, error) { return syscall.ParseNetlinkMessage(rb) } +// SetSendTimeout allows to set a send timeout on the socket +func (s *NetlinkSocket) SetSendTimeout(timeout *syscall.Timeval) error { + // Set a send timeout of SOCKET_SEND_TIMEOUT, this will allow the Send to periodically unblock and avoid that a routine + // remains stuck on a send on a closed fd + return syscall.SetsockoptTimeval(int(s.fd), syscall.SOL_SOCKET, syscall.SO_SNDTIMEO, timeout) +} + +// SetReceiveTimeout allows to set a receive timeout on the socket +func (s *NetlinkSocket) SetReceiveTimeout(timeout *syscall.Timeval) error { + // Set a read timeout of SOCKET_READ_TIMEOUT, this will allow the Read to periodically unblock and avoid that a routine + // remains stuck on a recvmsg on a closed fd + return syscall.SetsockoptTimeval(int(s.fd), syscall.SOL_SOCKET, syscall.SO_RCVTIMEO, timeout) +} + func (s *NetlinkSocket) GetPid() (uint32, error) { fd := int(atomic.LoadInt32(&s.fd)) lsa, err := syscall.Getsockname(fd)