diff --git a/core/node/provider.go b/core/node/provider.go
index 2c77e580c8d..fee95b0ff85 100644
--- a/core/node/provider.go
+++ b/core/node/provider.go
@@ -39,11 +39,21 @@ import (
 // The size of a batch that will be used for calculating average announcement
 // time per CID, inside of boxo/provider.ThroughputReport
 // and in 'ipfs stats provide' report.
+// Used when Provide.DHT.SweepEnabled=false
 const sampledBatchSize = 1000
 
 // Datastore key used to store previous reprovide strategy.
 const reprovideStrategyKey = "/reprovideStrategy"
 
+// Interval between reprovide queue monitoring checks for slow reprovide alerts.
+// Used when Provide.DHT.SweepEnabled=true
+const reprovideAlertPollInterval = 15 * time.Minute
+
+// Number of consecutive polling intervals with sustained queue growth before
+// triggering a slow reprovide alert (3 intervals = 45 minutes).
+// Used when Provide.DHT.SweepEnabled=true
+const consecutiveAlertsThreshold = 3
+
 // DHTProvider is an interface for providing keys to a DHT swarm. It holds a
 // state of keys to be advertised, and is responsible for periodically
 // publishing provider records for these keys to the DHT swarm before the
@@ -508,9 +518,129 @@ func SweepingProviderOpt(cfg *config.Config) fx.Option {
 		})
 	})
 
+	// extractSweepingProvider extracts a SweepingProvider from the given provider interface.
+	// It handles unwrapping buffered and dual providers, always selecting WAN for dual DHT.
+	// Returns nil if the provider is not a sweeping provider type.
+	var extractSweepingProvider func(prov any) *dhtprovider.SweepingProvider
+	extractSweepingProvider = func(prov any) *dhtprovider.SweepingProvider {
+		switch p := prov.(type) {
+		case *dhtprovider.SweepingProvider:
+			return p
+		case *ddhtprovider.SweepingProvider:
+			return p.WAN
+		case *buffered.SweepingProvider:
+			// Recursively extract from the inner provider
+			return extractSweepingProvider(p.Provider)
+		default:
+			return nil
+		}
+	}
+
+	type alertInput struct {
+		fx.In
+		Provider DHTProvider
+	}
+	reprovideAlert := fx.Invoke(func(lc fx.Lifecycle, in alertInput) {
+		prov := extractSweepingProvider(in.Provider)
+
+		var (
+			cancel context.CancelFunc
+			done   = make(chan struct{})
+		)
+
+		lc.Append(fx.Hook{
+			OnStart: func(ctx context.Context) error {
+				if prov == nil {
+					return nil
+				}
+				alertCtx, c := context.WithCancel(context.Background())
+				cancel = c
+				go func() {
+					defer close(done)
+
+					ticker := time.NewTicker(reprovideAlertPollInterval)
+					defer ticker.Stop()
+
+					var (
+						queueSize, prevQueueSize         int
+						queuedWorkers, prevQueuedWorkers bool
+						count                            int
+					)
+
+					for {
+						select {
+						case <-alertCtx.Done():
+							return
+						case <-ticker.C:
+						}
+
+						stats := prov.Stats()
+						queuedWorkers = stats.Workers.QueuedPeriodic > 0
+						queueSize = stats.Queues.PendingRegionReprovides
+
+						// Alert if reprovide queue keeps growing and all periodic workers are busy.
+						// Requires consecutiveAlertsThreshold intervals of sustained growth.
+						if prevQueuedWorkers && queuedWorkers && queueSize > prevQueueSize {
+							count++
+							if count >= consecutiveAlertsThreshold {
+								logger.Errorf(`
+🔔🔔🔔 Reprovide Operations Too Slow 🔔🔔🔔
+
+Your node is falling behind on DHT reprovides, which will affect content availability.
+
+Keyspace regions enqueued for reprovide:
+  %s ago:	%d
+  Now:	%d
+
+All periodic workers are busy!
+  Active workers:	%d / %d (max)
+  Active worker types:	%d periodic, %d burst
+  Dedicated workers:	%d periodic, %d burst
+
+Solutions (try in order):
+1. Increase Provide.DHT.MaxWorkers (current %d)
+2. Increase Provide.DHT.DedicatedPeriodicWorkers (current %d)
+3. Set Provide.DHT.SweepEnabled=false and Routing.AcceleratedDHTClient=true (last resort, not recommended)
+
+See how the reprovide queue is processed in real time with 'watch ipfs provide stat --all --compact'
+
+See docs: https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtmaxworkers`,
+									reprovideAlertPollInterval.Truncate(time.Minute).String(), prevQueueSize, queueSize,
+									stats.Workers.Active, stats.Workers.Max,
+									stats.Workers.ActivePeriodic, stats.Workers.ActiveBurst,
+									stats.Workers.DedicatedPeriodic, stats.Workers.DedicatedBurst,
+									stats.Workers.Max, stats.Workers.DedicatedPeriodic)
+							}
+						} else if !queuedWorkers {
+							count = 0
+						}
+
+						prevQueueSize, prevQueuedWorkers = queueSize, queuedWorkers
+					}
+				}()
+				return nil
+			},
+			OnStop: func(ctx context.Context) error {
+				if cancel == nil {
+					// Alert loop was never started (non-sweeping provider).
+					return nil
+				}
+				// Cancel the alert loop and wait for it to exit.
+				cancel()
+				select {
+				case <-done:
+				case <-ctx.Done():
+					return ctx.Err()
+				}
+				return nil
+			},
+		})
+	})
+
 	return fx.Options(
 		sweepingReprovider,
 		initKeystore,
+		reprovideAlert,
 	)
 }
 
diff --git a/docs/changelogs/v0.39.md b/docs/changelogs/v0.39.md
index c0a6522b32e..f992670178a 100644
--- a/docs/changelogs/v0.39.md
+++ b/docs/changelogs/v0.39.md
@@ -11,6 +11,7 @@ This release was brought to you by the [Shipyard](https://ipshipyard.com/) team.
 - [Overview](#overview)
 - [🔦 Highlights](#-highlights)
   - [📊 Detailed statistics for Sweep provider with `ipfs provide stat`](#-detailed-statistics-for-sweep-provider-with-ipfs-provide-stat)
+  - [🔔 Sweep provider slow reprovide warnings](#-sweep-provider-slow-reprovide-warnings)
   - [🪦 Deprecated `go-ipfs` name no longer published](#-deprecated-go-ipfs-name-no-longer-published)
   - [📦️ Important dependency updates](#-important-dependency-updates)
 - [📝 Changelog](#-changelog)
@@ -61,6 +62,21 @@ provider statistics instead of the default WAN DHT stats.
 > [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtsweepenabled)).
 > Legacy provider shows basic statistics without flag support.
 
+#### 🔔 Sweep provider slow reprovide warnings
+
+Kubo now monitors DHT reprovide operations when `Provide.DHT.SweepEnabled=true`
+and alerts you if your node is falling behind on reprovides.
+
+When the reprovide queue consistently grows and all periodic workers are busy,
+a warning is logged with:
+
+- Queue size and worker utilization details
+- Recommended solutions: increase `Provide.DHT.MaxWorkers` or `Provide.DHT.DedicatedPeriodicWorkers`
+- Command to monitor real-time progress: `watch ipfs provide stat --all --compact`
+
+The check polls every 15 minutes, and the alert only triggers after sustained
+growth across consecutive intervals. The legacy provider is unaffected by this change.
+
 #### 🪦 Deprecated `go-ipfs` name no longer published
 
 The `go-ipfs` name was deprecated in 2022 and renamed to `kubo`. Starting with this release, we have stopped publishing Docker images and distribution binaries under the old `go-ipfs` name.
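
For reference, the alert gating in the patch reduces to a small piece of state that can be exercised in isolation. The sketch below is illustrative only — `growthDetector` and `observe` are hypothetical names, not part of the patch — but it mirrors the consecutive-growth condition that guards the `logger.Errorf` call:

```go
package main

import "fmt"

// growthDetector mirrors the alert gating in the patch: fire only after
// `threshold` consecutive polls in which periodic workers were queued on
// the previous poll, are still queued, and the reprovide queue grew.
type growthDetector struct {
	threshold         int
	count             int
	prevQueueSize     int
	prevQueuedWorkers bool
}

// observe feeds one poll sample and reports whether an alert should fire.
func (d *growthDetector) observe(queueSize int, queuedWorkers bool) bool {
	fire := false
	if d.prevQueuedWorkers && queuedWorkers && queueSize > d.prevQueueSize {
		d.count++
		fire = d.count >= d.threshold
	} else if !queuedWorkers {
		// Idle periodic workers mean the node is keeping up: reset.
		d.count = 0
	}
	d.prevQueueSize, d.prevQueuedWorkers = queueSize, queuedWorkers
	return fire
}

func main() {
	d := &growthDetector{threshold: 3} // consecutiveAlertsThreshold
	// A steadily growing queue with all periodic workers queued:
	// poll 1 only seeds state, polls 2-4 count growth, poll 4 fires.
	for i, q := range []int{10, 20, 30, 40} {
		fmt.Printf("poll %d: alert=%v\n", i+1, d.observe(q, true))
	}
}
```

The first sample only seeds the previous-poll state, so with a threshold of 3 the alert fires on the fourth sample, three growth intervals later. Note that a poll with no queued periodic workers resets the counter, while a poll where the queue merely stops growing does not.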