From 0295184a2855952c153ccce243453c7e09d6c21f Mon Sep 17 00:00:00 2001 From: Eric Wendland Date: Tue, 30 Jun 2026 03:35:03 +0200 Subject: [PATCH 1/2] fix(core): reload changed prefix health checks Keep existing running prefix health checks when a config reload contains an equivalent prefix configuration. If the same prefix changes health config, stop the old checker and start the new one before wiring the advertisement. This prevents config reloads from advertising a newly unmarshaled but unstarted health checker with its default metric. --- core/nylon_apply.go | 78 ++++++++++++++++++++++++---- core/nylon_apply_test.go | 109 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 10 deletions(-) create mode 100644 core/nylon_apply_test.go diff --git a/core/nylon_apply.go b/core/nylon_apply.go index b5ad501b..6d5c9d80 100644 --- a/core/nylon_apply.go +++ b/core/nylon_apply.go @@ -137,16 +137,20 @@ func reconcileConfiguredEndpoints(neigh *state.Neighbour, desired []*state.Dynam } func (n *Nylon) reconcileAdvertisedPrefixes(next *state.CentralCfg) { - cur := n.GetRouter(n.LocalCfg.Id) - nextRouter := next.GetRouter(n.LocalCfg.Id) - currentLocal := make(map[netip.Prefix]state.PrefixHealthWrapper) - for _, prefix := range cur.Prefixes { - currentLocal[prefix.GetPrefix()] = prefix + if cur := n.CentralCfg.TryGetNode(n.LocalCfg.Id); cur != nil { + for _, prefix := range cur.Prefixes { + currentLocal[prefix.GetPrefix()] = prefix + } } - desiredLocal := make(map[netip.Prefix]state.PrefixHealthWrapper) - for _, prefix := range nextRouter.Prefixes { - desiredLocal[prefix.GetPrefix()] = prefix + nextNode := next.TryGetNode(n.LocalCfg.Id) + if nextNode == nil { + return + } + + desiredLocal := make(map[netip.Prefix]int) + for i, prefix := range nextNode.Prefixes { + desiredLocal[prefix.GetPrefix()] = i } for prefix, adv := range n.RouterState.Advertised { @@ -161,8 +165,15 @@ func (n *Nylon) reconcileAdvertisedPrefixes(next *state.CentralCfg) { } } - for prefix, desired := range desiredLocal { - if _, ok := currentLocal[prefix]; !ok { + for prefix, index := range desiredLocal { + desired := nextNode.Prefixes[index] + if current, ok := currentLocal[prefix]; ok && samePrefixHealthConfig(current, desired, &n.RouterTunables) { + desired = current + nextNode.Prefixes[index] = current + } else { + if current, ok := currentLocal[prefix]; ok { + current.Stop() + } n.Log.Debug("starting prefix healthcheck", "prefix", prefix) desired.Start(n.Log, &n.RouterTunables) } @@ -177,6 +188,53 @@ func (n *Nylon) reconcileAdvertisedPrefixes(next *state.CentralCfg) { } } +func samePrefixHealthConfig(a, b state.PrefixHealthWrapper, tunables *state.RouterTunables) bool { + switch av := a.PrefixHealth.(type) { + case *state.StaticPrefixHealth: + bv, ok := b.PrefixHealth.(*state.StaticPrefixHealth) + return ok && av.Prefix == bv.Prefix && av.Metric == bv.Metric + case *state.PingPrefixHealth: + bv, ok := b.PrefixHealth.(*state.PingPrefixHealth) + return ok && + av.Prefix == bv.Prefix && + av.Addr == bv.Addr && + av.BindIf == bv.BindIf && + equalUint32Ptr(av.Metric, bv.Metric) && + prefixHealthDelay(av.Delay, tunables) == prefixHealthDelay(bv.Delay, tunables) && + prefixHealthMaxFailures(av.MaxFailures, tunables) == prefixHealthMaxFailures(bv.MaxFailures, tunables) + case *state.HTTPPrefixHealth: + bv, ok := b.PrefixHealth.(*state.HTTPPrefixHealth) + return ok && + av.Prefix == bv.Prefix && + av.URL == bv.URL && + equalUint32Ptr(av.Metric, bv.Metric) && + prefixHealthDelay(av.Delay, tunables) == prefixHealthDelay(bv.Delay, tunables) + default: + return false + } +} + +func equalUint32Ptr(a, b *uint32) bool { + if a == nil || b == nil { + return a == b + } + return *a == *b +} + +func prefixHealthDelay(value *time.Duration, tunables *state.RouterTunables) time.Duration { + if value != nil { + return *value + } + return tunables.HealthCheckDelay +} + +func prefixHealthMaxFailures(value *int, tunables *state.RouterTunables) int { + if value != nil { + return *value + } + return tunables.HealthCheckMaxFailures +} + func (n *Nylon) startAdvertisedPrefixHealth() { for _, ph := range n.GetNode(n.LocalCfg.Id).Prefixes { n.Log.Debug("starting prefix healthcheck", "prefix", ph.GetPrefix()) diff --git a/core/nylon_apply_test.go b/core/nylon_apply_test.go new file mode 100644 index 00000000..94a4d7b8 --- /dev/null +++ b/core/nylon_apply_test.go @@ -0,0 +1,109 @@ +package core + +import ( + "io" + "log/slog" + "net/netip" + "testing" + "time" + + "github.com/encodeous/nylon/state" + "github.com/stretchr/testify/assert" +) + +func TestReconcileAdvertisedPrefixesStartsChangedPrefixHealth(t *testing.T) { + prefix := netip.MustParsePrefix("fd00::53/128") + oldPrefix := state.PrefixHealthWrapper{ + PrefixHealth: &state.StaticPrefixHealth{Prefix: prefix}, + } + n := testNylonWithPrefixes(oldPrefix) + n.RouterState.Advertised[prefix] = state.Advertisement{ + NodeId: n.LocalCfg.Id, + Expiry: maxConfigTime, + MetricFn: oldPrefix.GetMetric, + } + + delay := time.Millisecond + next := testCentralConfig(n.LocalCfg.Id, state.PrefixHealthWrapper{ + PrefixHealth: &state.HTTPPrefixHealth{ + Prefix: prefix, + URL: "http://127.0.0.1:1/healthz", + Delay: &delay, + }, + }) + + n.reconcileAdvertisedPrefixes(&next) + t.Cleanup(next.Routers[0].Prefixes[0].Stop) + + assert.Equal(t, state.INF, n.RouterState.Advertised[prefix].MetricFn()) +} + +func TestReconcileAdvertisedPrefixesReusesUnchangedRunningPrefixHealth(t *testing.T) { + prefix := netip.MustParsePrefix("fd00::53/128") + delay := time.Millisecond + current := state.PrefixHealthWrapper{ + PrefixHealth: &state.HTTPPrefixHealth{ + Prefix: prefix, + URL: "http://127.0.0.1:1/healthz", + Delay: &delay, + }, + } + n := testNylonWithPrefixes(current) + current.Start(n.Log, &n.RouterTunables) + t.Cleanup(current.Stop) + n.RouterState.Advertised[prefix] = state.Advertisement{ + NodeId: n.LocalCfg.Id, + Expiry: maxConfigTime, + MetricFn: current.GetMetric, + ExpiryFn: current.Stop, + } + + next := testCentralConfig(n.LocalCfg.Id, state.PrefixHealthWrapper{ + PrefixHealth: &state.HTTPPrefixHealth{ + Prefix: prefix, + URL: "http://127.0.0.1:1/healthz", + Delay: &delay, + }, + }) + + n.reconcileAdvertisedPrefixes(&next) + + assert.Same(t, current.PrefixHealth, next.Routers[0].Prefixes[0].PrefixHealth) + assert.Equal(t, state.INF, n.RouterState.Advertised[prefix].MetricFn()) +} + +func testNylonWithPrefixes(prefixes ...state.PrefixHealthWrapper) *Nylon { + id := state.NodeId("node") + tunables := state.DefaultRouterTunables() + return &Nylon{ + RouterTunables: tunables, + ConfigState: state.ConfigState{ + CentralCfg: testCentralConfig(id, prefixes...), + LocalCfg: state.LocalCfg{ + Id: id, + }, + }, + RouterState: &state.RouterState{ + RouterTunables: &tunables, + Id: id, + SelfSeqno: make(map[netip.Prefix]uint16), + Routes: make(map[netip.Prefix]state.SelRoute), + Sources: make(map[state.Source]state.FD), + Advertised: make(map[netip.Prefix]state.Advertisement), + }, + Log: slog.New(slog.NewTextHandler(io.Discard, nil)), + } +} + +func testCentralConfig(id state.NodeId, prefixes ...state.PrefixHealthWrapper) state.CentralCfg { + return state.CentralCfg{ + Routers: []state.RouterCfg{ + { + NodeCfg: state.NodeCfg{ + Id: id, + Prefixes: prefixes, + }, + }, + }, + } +} From b59f6672b004a9ed32c208b26a368db1e0ff9f56 Mon Sep 17 00:00:00 2001 From: Adam Chen Date: Thu, 2 Jul 2026 03:20:42 +0000 Subject: [PATCH 2/2] fix: cleanup and use INF metric before successful healthcheck --- core/nylon_apply.go | 49 +----------------------- core/nylon_apply_test.go | 27 ++++++++++++++ state/prefix_health.go | 80 +++++++++++++++++++++++++++++++++++----- 3 files changed, 98 insertions(+), 58 deletions(-) diff --git a/core/nylon_apply.go b/core/nylon_apply.go index 6d5c9d80..87a2ae51 100644 --- a/core/nylon_apply.go +++ b/core/nylon_apply.go @@ -167,7 +167,7 @@ func (n *Nylon) reconcileAdvertisedPrefixes(next *state.CentralCfg) { for prefix, index := range desiredLocal { desired := nextNode.Prefixes[index] - if current, ok := currentLocal[prefix]; ok && samePrefixHealthConfig(current, desired, &n.RouterTunables) { + if current, ok := currentLocal[prefix]; ok && current.SameConfig(desired, &n.RouterTunables) { desired = current nextNode.Prefixes[index] = current } else { @@ -188,53 +188,6 @@ func (n *Nylon) reconcileAdvertisedPrefixes(next *state.CentralCfg) { } } -func samePrefixHealthConfig(a, b state.PrefixHealthWrapper, tunables *state.RouterTunables) bool { - switch av := a.PrefixHealth.(type) { - case *state.StaticPrefixHealth: - bv, ok := b.PrefixHealth.(*state.StaticPrefixHealth) - return ok && av.Prefix == bv.Prefix && av.Metric == bv.Metric - case *state.PingPrefixHealth: - bv, ok := b.PrefixHealth.(*state.PingPrefixHealth) - return ok && - av.Prefix == bv.Prefix && - av.Addr == bv.Addr && - av.BindIf == bv.BindIf && - equalUint32Ptr(av.Metric, bv.Metric) && - prefixHealthDelay(av.Delay, tunables) == prefixHealthDelay(bv.Delay, tunables) && - prefixHealthMaxFailures(av.MaxFailures, tunables) == prefixHealthMaxFailures(bv.MaxFailures, tunables) - case *state.HTTPPrefixHealth: - bv, ok := b.PrefixHealth.(*state.HTTPPrefixHealth) - return ok && - av.Prefix == bv.Prefix && - av.URL == bv.URL && - equalUint32Ptr(av.Metric, bv.Metric) && - prefixHealthDelay(av.Delay, tunables) == prefixHealthDelay(bv.Delay, tunables) - default: - return false - } -} - -func equalUint32Ptr(a, b *uint32) bool { - if a == nil || b == nil { - return a == b - } - return *a == *b -} - -func prefixHealthDelay(value *time.Duration, tunables *state.RouterTunables) time.Duration { - if value != nil { - return *value - } - return tunables.HealthCheckDelay -} - -func prefixHealthMaxFailures(value *int, tunables *state.RouterTunables) int { - if value != nil { - return *value - } - return tunables.HealthCheckMaxFailures -} - func (n *Nylon) startAdvertisedPrefixHealth() { for _, ph := range n.GetNode(n.LocalCfg.Id).Prefixes { n.Log.Debug("starting prefix healthcheck", "prefix", ph.GetPrefix()) diff --git a/core/nylon_apply_test.go b/core/nylon_apply_test.go index 94a4d7b8..be7d1ae2 100644 --- a/core/nylon_apply_test.go +++ b/core/nylon_apply_test.go @@ -38,6 +38,33 @@ func TestReconcileAdvertisedPrefixesStartsChangedPrefixHealth(t *testing.T) { assert.Equal(t, state.INF, n.RouterState.Advertised[prefix].MetricFn()) } +func TestReconcileAdvertisedPrefixesStartsChangedPingPrefixHealth(t *testing.T) { + prefix := netip.MustParsePrefix("fd00::54/128") + oldPrefix := state.PrefixHealthWrapper{ + PrefixHealth: &state.StaticPrefixHealth{Prefix: prefix}, + } + n := testNylonWithPrefixes(oldPrefix) + n.RouterState.Advertised[prefix] = state.Advertisement{ + NodeId: n.LocalCfg.Id, + Expiry: maxConfigTime, + MetricFn: oldPrefix.GetMetric, + } + + delay := 100 * time.Millisecond + next := testCentralConfig(n.LocalCfg.Id, state.PrefixHealthWrapper{ + PrefixHealth: &state.PingPrefixHealth{ + Prefix: prefix, + Addr: netip.MustParseAddr("127.0.0.1"), + Delay: &delay, + }, + }) + + n.reconcileAdvertisedPrefixes(&next) + t.Cleanup(next.Routers[0].Prefixes[0].Stop) + + assert.Equal(t, state.INF, n.RouterState.Advertised[prefix].MetricFn()) +} + func TestReconcileAdvertisedPrefixesReusesUnchangedRunningPrefixHealth(t *testing.T) { prefix := netip.MustParsePrefix("fd00::53/128") delay := time.Millisecond diff --git a/state/prefix_health.go b/state/prefix_health.go index 11a70122..3a1f90d4 100644 --- a/state/prefix_health.go +++ b/state/prefix_health.go @@ -39,6 +39,11 @@ func (s *StaticPrefixHealth) Start(log *slog.Logger, t *RouterTunables) { // do nothing } +func (s *StaticPrefixHealth) sameConfig(other PrefixHealth, _ *RouterTunables) bool { + o, ok := other.(*StaticPrefixHealth) + return ok && s.Prefix == o.Prefix && s.Metric == o.Metric +} + type PingPrefixHealth struct { Prefix netip.Prefix `yaml:"prefix"` Addr netip.Addr `yaml:"addr"` // the address to ping @@ -46,7 +51,7 @@ type PingPrefixHealth struct { Delay *time.Duration `yaml:"delay,omitempty"` // delay between pings BindIf string `yaml:"bind_if,omitempty"` // local interface to bind to Metric *uint32 `yaml:"metric,omitempty"` // metric override - lastMetric uint32 + lastMetric atomic.Uint32 running atomic.Bool } @@ -81,7 +86,7 @@ func (p *PingPrefixHealth) GetMetric() uint32 { if p.Metric != nil { return *p.Metric } - return p.lastMetric + return p.lastMetric.Load() } func (p *PingPrefixHealth) GetPrefix() netip.Prefix { return p.Prefix @@ -96,11 +101,12 @@ func (p *PingPrefixHealth) Start(log *slog.Logger, t *RouterTunables) { if p.MaxFailures == nil { p.MaxFailures = &t.HealthCheckMaxFailures } + // Default to unreachable until the first successful probe. + p.lastMetric.Store(INF) go func() { ticker := time.NewTicker(*p.Delay) for p.running.Load() { time.Sleep(*p.Delay) - p.lastMetric = INF bind4 := "" bind6 := "" var err error @@ -133,25 +139,36 @@ func (p *PingPrefixHealth) Start(log *slog.Logger, t *RouterTunables) { rtt, err := pinger.PingAttempts(addr, time.Duration(int64(*p.Delay)/int64(*p.MaxFailures)), *p.MaxFailures) if err != nil { // failed - p.lastMetric = INF + p.lastMetric.Store(INF) log.Debug("prefix healthcheck failed", "prefix", p.Prefix.String(), "addr", p.Addr.String(), "error", err) pinger.Close() break // break to outer loop to recreate pinger } else { // success - p.lastMetric = DurationToMetric(rtt) + p.lastMetric.Store(DurationToMetric(rtt)) } } } }() } +func (p *PingPrefixHealth) sameConfig(other PrefixHealth, tunables *RouterTunables) bool { + o, ok := other.(*PingPrefixHealth) + return ok && + p.Prefix == o.Prefix && + p.Addr == o.Addr && + p.BindIf == o.BindIf && + sameOptionalUint32(p.Metric, o.Metric) && + prefixHealthDelay(p.Delay, tunables) == prefixHealthDelay(o.Delay, tunables) && + prefixHealthMaxFailures(p.MaxFailures, tunables) == prefixHealthMaxFailures(o.MaxFailures, tunables) +} + type HTTPPrefixHealth struct { Prefix netip.Prefix `yaml:"prefix"` URL string `yaml:"url"` // the URL to check Delay *time.Duration `yaml:"delay,omitempty"` // delay between probes Metric *uint32 `yaml:"metric,omitempty"` // metric override - lastMetric uint32 + lastMetric atomic.Uint32 running atomic.Bool } @@ -163,7 +180,7 @@ func (h *HTTPPrefixHealth) GetMetric() uint32 { if h.Metric != nil { return *h.Metric } - return h.lastMetric + return h.lastMetric.Load() } func (h *HTTPPrefixHealth) GetPrefix() netip.Prefix { return h.Prefix @@ -172,7 +189,7 @@ func (h *HTTPPrefixHealth) Start(log *slog.Logger, t *RouterTunables) { if h.running.Swap(true) { return } - h.lastMetric = INF + h.lastMetric.Store(INF) if h.Delay == nil { h.Delay = &t.HealthCheckDelay } @@ -186,21 +203,64 @@ func (h *HTTPPrefixHealth) Start(log *slog.Logger, t *RouterTunables) { resp, err := http.Get(h.URL) if err != nil || resp.StatusCode != http.StatusOK { // failed - h.lastMetric = INF + h.lastMetric.Store(INF) log.Debug("prefix healthcheck failed", "prefix", h.Prefix.String(), "url", h.URL, "error", err) } else { // success rtt := time.Since(startTime) - h.lastMetric = DurationToMetric(rtt) + h.lastMetric.Store(DurationToMetric(rtt)) } } }() } +func (h *HTTPPrefixHealth) sameConfig(other PrefixHealth, tunables *RouterTunables) bool { + o, ok := other.(*HTTPPrefixHealth) + return ok && + h.Prefix == o.Prefix && + h.URL == o.URL && + sameOptionalUint32(h.Metric, o.Metric) && + prefixHealthDelay(h.Delay, tunables) == prefixHealthDelay(o.Delay, tunables) +} + type PrefixHealthWrapper struct { PrefixHealth } +type prefixHealthConfig interface { + sameConfig(other PrefixHealth, tunables *RouterTunables) bool +} + +// SameConfig reports whether two prefix health checks have equivalent configuration. +func (p PrefixHealthWrapper) SameConfig(other PrefixHealthWrapper, tunables *RouterTunables) bool { + if p.PrefixHealth == nil || other.PrefixHealth == nil { + return p.PrefixHealth == other.PrefixHealth + } + config, ok := p.PrefixHealth.(prefixHealthConfig) + return ok && config.sameConfig(other.PrefixHealth, tunables) +} + +func sameOptionalUint32(a, b *uint32) bool { + if a == nil || b == nil { + return a == b + } + return *a == *b +} + +func prefixHealthDelay(value *time.Duration, tunables *RouterTunables) time.Duration { + if value != nil { + return *value + } + return tunables.HealthCheckDelay +} + +func prefixHealthMaxFailures(value *int, tunables *RouterTunables) int { + if value != nil { + return *value + } + return tunables.HealthCheckMaxFailures +} + func (p PrefixHealthWrapper) MarshalYAML() (interface{}, error) { switch v := p.PrefixHealth.(type) { case *StaticPrefixHealth: