Skip to content

Commit 6682e88

Browse files
committed
libp2p: rework retry logic
1 parent d905f8e commit 6682e88

6 files changed

Lines changed: 51 additions & 59 deletions

File tree

cmd/nebula/cmd.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ var rootConfig = &config.Root{
3434
LogFormat: "text",
3535
LogDisableColor: false,
3636
LogErrors: false,
37-
DialTimeout: 10 * time.Second,
37+
DialTimeout: 15 * time.Second,
3838
MetricsHost: "0.0.0.0",
3939
MetricsPort: 6666,
4040
TracesHost: "", // disabled

cmd/nebula/cmd_crawl.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,10 @@ func CrawlAction(c *cli.Context) error {
219219
// Allow transient connections. This way we can crawl a peer even if it is relayed.
220220
ctx = network.WithUseTransient(ctx, "reach peers behind NATs")
221221

222+
// This is a custom configuration option that only exists in our fork of go-libp2p.
223+
// see: https://github.com/plprobelab/go-libp2p/commit/f6d73ce3093ded293f0de032d239709069fac586
224+
ctx = network.WithDisableBackoff(ctx, "prevent backoff")
225+
222226
handlerCfg := &core.CrawlHandlerConfig{
223227
TrackNeighbors: cfg.PersistNeighbors,
224228
}

discv5/crawler.go

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"sync"
1010
"time"
1111

12+
"github.com/cenkalti/backoff/v4"
1213
"github.com/ethereum/go-ethereum/p2p/enode"
1314
"github.com/libp2p/go-libp2p/core/network"
1415
"github.com/libp2p/go-libp2p/core/peer"
@@ -241,11 +242,13 @@ func (c *Crawler) connect(ctx context.Context, pi peer.AddrInfo) error {
241242
return fmt.Errorf("skipping node as it has no public IP address") // change knownErrs map if changing this msg
242243
}
243244

244-
var (
245-
retry int = 0
246-
maxRetries int = 2
247-
firstErr error = nil
248-
)
245+
// init an exponential backoff
246+
bo := backoff.NewExponentialBackOff()
247+
bo.InitialInterval = time.Second
248+
bo.MaxInterval = 10 * time.Second
249+
bo.MaxElapsedTime = time.Minute
250+
251+
var retry int = 0
249252

250253
for {
251254
logEntry := log.WithFields(log.Fields{
@@ -266,44 +269,36 @@ func (c *Crawler) connect(ctx context.Context, pi peer.AddrInfo) error {
266269
return nil
267270
}
268271

269-
// at this point we know something went wrong. Track the first error
270-
// because subsequent connection attempts have a shorter timeout which
271-
// means that it's more likely to run into a context.DeadlineExceeded
272-
// error. If that's the case, we return the original error for tracking
273-
// purposes. If the error is nil, we're not connected and not identified
274-
// firstErr will stay nil. This is fine because we'll track that outside
275-
// of this connect call.
276-
if firstErr == nil {
277-
firstErr = err
278-
}
279-
280272
switch true {
281273
case strings.Contains(err.Error(), db.ErrorStr[models.NetErrorConnectionRefused]):
274+
// Might be transient because the remote doesn't want us to connect. Try again!
282275
case strings.Contains(err.Error(), db.ErrorStr[models.NetErrorConnectionGated]):
276+
// Hints at a configuration issue and should not happen, but if it
277+
// does it could be transient. Try again anyway, but at least log a warning.
278+
logEntry.WithError(err).Warnln("Connection gated!")
283279
case strings.Contains(err.Error(), db.ErrorStr[models.NetErrorCantAssignRequestedAddress]):
284-
// case strings.Contains(err.Error(), db.ErrorStr[models.NetErrorConnectionResetByPeer]): often because of mismatching peer ids, so excluding it for now
280+
// Transient error due to local UDP issues. Try again!
281+
case strings.Contains(err.Error(), "dial backoff"):
282+
// should not happen because we disabled backoff checks with our
283+
// go-libp2p fork. Try again anyway, but at least log a warning.
284+
logEntry.WithError(err).Warnln("Dial backoff!")
285+
case strings.Contains(err.Error(), "RESOURCE_LIMIT_EXCEEDED (201)"): // thrown by a circuit relay
286+
// We already have too many open connections over a relay. Try again!
285287
default:
286-
if errors.Is(err, context.DeadlineExceeded) {
287-
err = firstErr
288-
}
289288
logEntry.WithError(err).Debugln("Failed connecting to peer", pi.ID.ShortString())
290289
return err
291290
}
292291

293-
if retry == maxRetries {
294-
if errors.Is(err, context.DeadlineExceeded) {
295-
err = firstErr
296-
}
292+
sleepDur := bo.NextBackOff()
293+
if sleepDur == backoff.Stop {
297294
logEntry.WithError(err).Debugln("Exceeded retries connecting to peer", pi.ID.ShortString())
298295
return err
299296
}
300297

301-
sleep := time.Second * time.Duration(3*(retry+1)) // TODO: parameterize
302-
303298
select {
304299
case <-ctx.Done():
305300
return ctx.Err()
306-
case <-time.After(sleep):
301+
case <-time.After(sleepDur):
307302
retry += 1
308303
continue
309304
}

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ require (
4242
golang.org/x/sync v0.3.0
4343
)
4444

45+
replace github.com/libp2p/go-libp2p v0.28.3 => github.com/plprobelab/go-libp2p v0.0.0-20231206123035-f6d73ce3093d // branch v0.28.3-nebula
46+
4547
require (
4648
github.com/StackExchange/wmi v1.2.1 // indirect
4749
github.com/benbjohnson/clock v1.3.5 // indirect

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -469,8 +469,6 @@ github.com/libp2p/go-cidranger v1.1.0 h1:ewPN8EZ0dd1LSnrtuwd4709PXVcITVeuwbag38y
469469
github.com/libp2p/go-cidranger v1.1.0/go.mod h1:KWZTfSr+r9qEo9OkI9/SIEeAtw+NNoU0dXIXt15Okic=
470470
github.com/libp2p/go-flow-metrics v0.1.0 h1:0iPhMI8PskQwzh57jB9WxIuIOQ0r+15PChFGkx3Q3WM=
471471
github.com/libp2p/go-flow-metrics v0.1.0/go.mod h1:4Xi8MX8wj5aWNDAZttg6UPmc0ZrnFNsMtpsYUClFtro=
472-
github.com/libp2p/go-libp2p v0.28.3 h1:zEbLhMvqPF0lOkil9vOVM+7kXj1SWwiOV5p8J9Fw0Lw=
473-
github.com/libp2p/go-libp2p v0.28.3/go.mod h1:iEzd0V6Bai6Joi9MmxYFkUJT6WA1ca9pj6X4C0UjeS4=
474472
github.com/libp2p/go-libp2p-asn-util v0.3.0 h1:gMDcMyYiZKkocGXDQ5nsUQyquC9+H+iLEQHwOCZ7s8s=
475473
github.com/libp2p/go-libp2p-asn-util v0.3.0/go.mod h1:B1mcOrKUE35Xq/ASTmQ4tN3LNzVVaMNmq2NACuqyB9w=
476474
github.com/libp2p/go-libp2p-kad-dht v0.25.0 h1:T2SXQ/VlXTQVLChWY/+OyOsmGMRJvB5kiR+eJt7jtvI=
@@ -622,6 +620,8 @@ github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
622620
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
623621
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
624622
github.com/pkg/sftp v1.13.1/go.mod h1:3HaPG6Dq1ILlpPZRO0HVMrsydcdLt6HRDccSgb87qRg=
623+
github.com/plprobelab/go-libp2p v0.0.0-20231206123035-f6d73ce3093d h1:KX8qHDA2DC+zj0Sz+L2Xw+tzjrjIyQ7y0DOK/zT3LPY=
624+
github.com/plprobelab/go-libp2p v0.0.0-20231206123035-f6d73ce3093d/go.mod h1:iEzd0V6Bai6Joi9MmxYFkUJT6WA1ca9pj6X4C0UjeS4=
625625
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
626626
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
627627
github.com/polydawn/refmt v0.89.0 h1:ADJTApkvkeBZsN0tBTx8QjpD9JkmxbKp0cxfr9qszm4=

libp2p/crawler_p2p.go

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ package libp2p
22

33
import (
44
"context"
5-
"errors"
65
"fmt"
76
"strings"
87
"sync"
98
"time"
109

10+
"github.com/cenkalti/backoff/v4"
1111
kbucket "github.com/libp2p/go-libp2p-kbucket"
1212
"github.com/libp2p/go-libp2p/core/network"
1313
"github.com/libp2p/go-libp2p/core/peer"
@@ -142,11 +142,14 @@ func (c *Crawler) connect(ctx context.Context, pi peer.AddrInfo) error {
142142
return fmt.Errorf("skipping node as it has no public IP address") // change knownErrs map if changing this msg
143143
}
144144

145-
var (
146-
retry int = 0
147-
maxRetries int = 3
148-
firstErr error = nil
149-
)
145+
// init an exponential backoff
146+
bo := backoff.NewExponentialBackOff()
147+
bo.InitialInterval = time.Second
148+
bo.MaxInterval = 10 * time.Second
149+
bo.MaxElapsedTime = time.Minute
150+
151+
// keep track of retries for debug logging
152+
retry := 0
150153

151154
for {
152155
logEntry := log.WithFields(log.Fields{
@@ -166,48 +169,36 @@ func (c *Crawler) connect(ctx context.Context, pi peer.AddrInfo) error {
166169
return nil
167170
}
168171

169-
// at this point we know something went wrong. Track the first error
170-
// because subsequent connection attempts have a shorter timeout which
171-
// means that it's more likely to run into a context.DeadlineExceeded
172-
// error. If that's the case, we return the original error for tracking
173-
// purposes.
174-
if firstErr == nil {
175-
firstErr = err
176-
}
177-
178172
switch true {
179173
case strings.Contains(err.Error(), db.ErrorStr[models.NetErrorConnectionRefused]):
180174
// Might be transient because the remote doesn't want us to connect. Try again!
181175
case strings.Contains(err.Error(), db.ErrorStr[models.NetErrorConnectionGated]):
182-
// Hints at a configuration issue but could be transient. Try again!
176+
// Hints at a configuration issue and should not happen, but if it
177+
// does it could be transient. Try again anyway, but at least log a warning.
178+
logEntry.WithError(err).Warnln("Connection gated!")
183179
case strings.Contains(err.Error(), db.ErrorStr[models.NetErrorCantAssignRequestedAddress]):
184180
// Transient error due to local UDP issues. Try again!
185-
maxRetries = 10 // increase the maximum number of retries as the error _should_ go away
186-
case strings.Contains(err.Error(), db.ErrorStr["RESOURCE_LIMIT_EXCEEDED (201)"]): // thrown by a circuit relay
181+
case strings.Contains(err.Error(), "dial backoff"):
182+
// should not happen because we disabled backoff checks with our
183+
// go-libp2p fork. Try again anyway, but at least log a warning.
184+
logEntry.WithError(err).Warnln("Dial backoff!")
185+
case strings.Contains(err.Error(), "RESOURCE_LIMIT_EXCEEDED (201)"): // thrown by a circuit relay
187186
// We already have too many open connections over a relay. Try again!
188-
maxRetries = 10 // increase the maximum number of retries as the error _should_ go away
189187
default:
190-
if errors.Is(err, context.DeadlineExceeded) {
191-
err = firstErr
192-
}
193188
logEntry.WithError(err).Debugln("Failed connecting to peer", pi.ID.ShortString())
194189
return err
195190
}
196191

197-
if retry == maxRetries {
198-
if errors.Is(err, context.DeadlineExceeded) {
199-
err = firstErr
200-
}
192+
sleepDur := bo.NextBackOff()
193+
if sleepDur == backoff.Stop {
201194
logEntry.WithError(err).Debugln("Exceeded retries connecting to peer", pi.ID.ShortString())
202195
return err
203196
}
204197

205-
sleep := time.Second * time.Duration(3*(retry+1)) // TODO: parameterize
206-
207198
select {
208199
case <-ctx.Done():
209200
return ctx.Err()
210-
case <-time.After(sleep):
201+
case <-time.After(sleepDur):
211202
retry += 1
212203
continue
213204
}

0 commit comments

Comments
 (0)