From ead9745736ba161d8edc2d8904ad73d56db6ddf9 Mon Sep 17 00:00:00 2001
From: David Gageot <david.gageot@docker.com>
Date: Mon, 18 May 2026 11:52:19 +0200
Subject: [PATCH 1/2] fix(runtime): fall back to provider_opts.context_size for
 compaction

Local models not catalogued in models.dev (e.g. Docker Model Runner
(DMR) serving HuggingFace GGUFs) can now supply context_size via
provider_opts to enable compaction. When the models.dev lookup fails
or yields no positive context limit, the runtime falls back to this
user-supplied limit, making compaction (proactive threshold and
post-overflow recovery) functional for uncatalogued models.

Fixes #2800
---
 pkg/runtime/compaction_context_limit_test.go | 162 +++++++++++++++++++
 pkg/runtime/loop.go                          |  19 ++-
 pkg/runtime/session_compaction.go            |  59 ++++++-
 3 files changed, 232 insertions(+), 8 deletions(-)
 create mode 100644 pkg/runtime/compaction_context_limit_test.go

diff --git a/pkg/runtime/compaction_context_limit_test.go b/pkg/runtime/compaction_context_limit_test.go
new file mode 100644
index 000000000..15b14c6e0
--- /dev/null
+++ b/pkg/runtime/compaction_context_limit_test.go
@@ -0,0 +1,162 @@
+package runtime
+
+import (
+	"context"
+	"errors"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/docker/docker-agent/pkg/agent"
+	"github.com/docker/docker-agent/pkg/chat"
+	"github.com/docker/docker-agent/pkg/config/latest"
+	"github.com/docker/docker-agent/pkg/model/provider/base"
+	"github.com/docker/docker-agent/pkg/modelsdev"
+	"github.com/docker/docker-agent/pkg/team"
+	"github.com/docker/docker-agent/pkg/tools"
+)
+
+// providerOptsProvider is a minimal provider used to test that
+// [providerContextLimit] reads the user-supplied context_size from
+// the resolved [latest.ModelConfig.ProviderOpts] map.
+type providerOptsProvider struct {
+	id   string
+	opts map[string]any
+}
+
+func (p *providerOptsProvider) ID() modelsdev.ID { return modelsdev.ParseIDOrZero(p.id) }
+
+func (p *providerOptsProvider) CreateChatCompletionStream(context.Context, []chat.Message, []tools.Tool) (chat.MessageStream, error) {
+	return &mockStream{}, nil
+}
+
+func (p *providerOptsProvider) BaseConfig() base.Config {
+	return base.Config{
+		ModelConfig: latest.ModelConfig{ProviderOpts: p.opts},
+	}
+}
+
+func (p *providerOptsProvider) MaxTokens() int { return 0 }
+
+// TestProviderContextLimit covers the fallback that lets compaction
+// trigger for local models that aren't catalogued in models.dev. The
+// helper accepts the various scalar shapes that YAML/JSON decoders
+// produce ("32768", 32768, 32768.0) and rejects junk.
+func TestProviderContextLimit(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		opts map[string]any
+		want int64
+	}{
+		{name: "nil opts", opts: nil, want: 0},
+		{name: "empty opts", opts: map[string]any{}, want: 0},
+		{name: "missing key", opts: map[string]any{"other": 123}, want: 0},
+		{name: "int", opts: map[string]any{"context_size": 32768}, want: 32768},
+		{name: "int64", opts: map[string]any{"context_size": int64(65536)}, want: 65536},
+		{name: "float64 (json)", opts: map[string]any{"context_size": float64(8192)}, want: 8192},
+		{name: "string decimal", opts: map[string]any{"context_size": "16384"}, want: 16384},
+		{name: "string with whitespace", opts: map[string]any{"context_size": "  4096 "}, want: 4096},
+		{name: "non-numeric string", opts: map[string]any{"context_size": "lots"}, want: 0},
+		{name: "negative", opts: map[string]any{"context_size": -1}, want: 0},
+		{name: "zero", opts: map[string]any{"context_size": 0}, want: 0},
+		{name: "bool", opts: map[string]any{"context_size": true}, want: 0},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			p := &providerOptsProvider{id: "dmr/test-model", opts: tt.opts}
+			assert.Equal(t, tt.want, providerContextLimit(p))
+		})
+	}
+}
+
+// TestProviderContextLimit_NilProvider verifies the helper handles a
+// nil provider safely (returns 0). Belt-and-braces for callers that
+// can't statically prove non-nil.
+func TestProviderContextLimit_NilProvider(t *testing.T) {
+	t.Parallel()
+	assert.Equal(t, int64(0), providerContextLimit(nil))
+}
+
+// errorModelStore returns a "not found" error from GetModel, simulating
+// a models.dev catalogue that doesn't have an entry for the configured
+// model (the exact case reported for DMR + HuggingFace GGUF models).
+type errorModelStore struct {
+	ModelStore
+
+	err error
+}
+
+func (s errorModelStore) GetModel(_ context.Context, _ modelsdev.ID) (*modelsdev.Model, error) {
+	return nil, s.err
+}
+
+// TestCompactionContextLimit_FallsBackToProviderOpts verifies that the
+// runtime resolves a usable context limit from provider_opts.context_size
+// when the models.dev catalogue lookup fails.
+//
+// This is the core of the fix for the reported bug: DMR users with a
+// model not catalogued in models.dev (e.g. a HuggingFace GGUF) could
+// supply context_size via provider_opts but compaction silently became
+// a no-op, eventually surfacing as "Failed to get model definition"
+// when overflow recovery was attempted.
+func TestCompactionContextLimit_FallsBackToProviderOpts(t *testing.T) {
+	t.Parallel()
+
+	prov := &providerOptsProvider{
+		id:   "dmr/hf.co/unsloth/qwen3-4b-gguf:Q4_K_M",
+		opts: map[string]any{"context_size": 32768},
+	}
+	root := agent.New("root", "test", agent.WithModel(prov))
+	tm := team.New(team.WithAgents(root))
+
+	rt, err := NewLocalRuntime(tm, WithModelStore(errorModelStore{err: errors.New("not in catalogue")}))
+	require.NoError(t, err)
+
+	got := rt.compactionContextLimit(t.Context(), root)
+	assert.Equal(t, int64(32768), got,
+		"context limit must fall back to provider_opts.context_size when models.dev has no entry")
+}
+
+// TestCompactionContextLimit_PrefersModelsDev verifies that a present
+// models.dev limit wins over provider_opts.context_size. This keeps
+// existing behaviour intact for catalogued models.
+func TestCompactionContextLimit_PrefersModelsDev(t *testing.T) {
+	t.Parallel()
+
+	prov := &providerOptsProvider{
+		id:   "openai/gpt-5",
+		opts: map[string]any{"context_size": 1}, // user can't lie us into a tiny limit
+	}
+	root := agent.New("root", "test", agent.WithModel(prov))
+	tm := team.New(team.WithAgents(root))
+
+	rt, err := NewLocalRuntime(tm, WithModelStore(mockModelStoreWithLimit{limit: 200_000}))
+	require.NoError(t, err)
+
+	got := rt.compactionContextLimit(t.Context(), root)
+	assert.Equal(t, int64(200_000), got,
+		"models.dev limit must take precedence over provider_opts.context_size")
+}
+
+// TestCompactionContextLimit_NoSourcesYieldsZero verifies the legacy
+// behaviour: when neither models.dev nor provider_opts provides a
+// limit, the function returns 0 (callers treat this as "can't
+// compact"; the LLM strategy enforces ContextLimit > 0).
+func TestCompactionContextLimit_NoSourcesYieldsZero(t *testing.T) {
+	t.Parallel()
+
+	prov := &providerOptsProvider{id: "unknown/model"} // no opts
+	root := agent.New("root", "test", agent.WithModel(prov))
+	tm := team.New(team.WithAgents(root))
+
+	rt, err := NewLocalRuntime(tm, WithModelStore(mockModelStore{}))
+	require.NoError(t, err)
+
+	got := rt.compactionContextLimit(t.Context(), root)
+	assert.Equal(t, int64(0), got)
+}
diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go
index a0a441970..99b576f77 100644
--- a/pkg/runtime/loop.go
+++ b/pkg/runtime/loop.go
@@ -362,14 +362,20 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session,
 		if err != nil {
 			slog.DebugContext(ctx, "Failed to get model definition", "error", err)
 		}
-		// We can only compact if we know the limit.
+		// We can only compact if we know the limit. Prefer the
+		// models.dev catalogue; fall back to the user-supplied
+		// provider_opts.context_size so local models that aren't
+		// catalogued (e.g. DMR with HuggingFace GGUFs) still benefit
+		// from automatic compaction.
 		var contextLimit int64
 		if m != nil {
 			contextLimit = int64(m.Limit.Context)
-
-			if r.sessionCompaction && compaction.ShouldCompact(sess.InputTokens, sess.OutputTokens, 0, contextLimit) {
-				r.compactWithReason(ctx, sess, "", compactionReasonThreshold, sink)
-			}
+		}
+		if contextLimit <= 0 {
+			contextLimit = providerContextLimit(model)
+		}
+		if contextLimit > 0 && r.sessionCompaction && compaction.ShouldCompact(sess.InputTokens, sess.OutputTokens, 0, contextLimit) {
+			r.compactWithReason(ctx, sess, "", compactionReasonThreshold, sink)
 		}
 
 		// Drain steer messages queued while idle or before the first model call
@@ -779,7 +785,8 @@ func (r *LocalRuntime) compactIfNeeded(
 	messageCountBefore int,
 	events EventSink,
 ) {
-	if m == nil || !r.sessionCompaction || contextLimit <= 0 {
+	_ = m // models.dev definition isn't required: contextLimit may have been derived from provider_opts.context_size when the model is missing from the catalogue.
+	if !r.sessionCompaction || contextLimit <= 0 {
 		return
 	}
 
diff --git a/pkg/runtime/session_compaction.go b/pkg/runtime/session_compaction.go
index 6bc557c00..4be0554f4 100644
--- a/pkg/runtime/session_compaction.go
+++ b/pkg/runtime/session_compaction.go
@@ -3,6 +3,8 @@ package runtime
 import (
 	"context"
 	"log/slog"
+	"strconv"
+	"strings"
 
 	"github.com/docker/docker-agent/pkg/agent"
 	"github.com/docker/docker-agent/pkg/chat"
@@ -162,6 +164,13 @@ func summaryFromHook(sess *session.Session, a *agent.Agent, pre *hooks.Result) *
 // when it can't be resolved. Failure is non-fatal: a before_compaction
 // hook may supply its own summary and never need the model definition.
 // The LLM strategy itself enforces ContextLimit > 0.
+//
+// When the modelsdev definition is unavailable (e.g. a Docker Model
+// Runner model that isn't catalogued, like a HuggingFace GGUF), the
+// limit falls back to the user-supplied [provider_opts.context_size].
+// This mirrors what DMR itself uses to size the inference context, so
+// compaction triggers (proactive 90% threshold and post-overflow
+// recovery) work for local models without a models.dev entry.
 func (r *LocalRuntime) compactionContextLimit(ctx context.Context, a *agent.Agent) int64 {
 	if a == nil || a.Model(ctx) == nil {
 		return 0
@@ -171,10 +180,56 @@ func (r *LocalRuntime) compactionContextLimit(ctx context.Context, a *agent.Agen
 		options.WithMaxTokens(compactor.MaxSummaryTokens),
 	)
 	m, err := r.modelsStore.GetModel(ctx, summaryModel.ID())
-	if err != nil || m == nil {
+	if err == nil && m != nil && m.Limit.Context > 0 {
+		return int64(m.Limit.Context)
+	}
+	return providerContextLimit(summaryModel)
+}
+
+// providerContextLimit reads [provider_opts.context_size] from a
+// provider's resolved [latest.ModelConfig], returning 0 when unset or
+// not parseable as an integer. This is the fallback used when the
+// models.dev catalogue does not have an entry for the configured
+// model (typically Docker Model Runner with a HuggingFace GGUF model).
+//
+// Accepted shapes mirror what YAML/JSON decoders may produce: int,
+// int32, int64, float32, float64 (truncated), and decimal strings
+// (surrounding whitespace is trimmed). Zero or negative values are
+// treated as "unset" so callers never compact against a degenerate limit.
+func providerContextLimit(p provider.Provider) int64 {
+	if p == nil {
+		return 0
+	}
+	opts := p.BaseConfig().ModelConfig.ProviderOpts
+	v, ok := opts["context_size"]
+	if !ok {
+		return 0
+	}
+	var n int64
+	switch t := v.(type) {
+	case int64:
+		n = t
+	case int:
+		n = int64(t)
+	case int32:
+		n = int64(t)
+	case float64:
+		n = int64(t)
+	case float32:
+		n = int64(t)
+	case string:
+		parsed, err := strconv.ParseInt(strings.TrimSpace(t), 10, 64)
+		if err != nil {
+			return 0
+		}
+		n = parsed
+	default:
+		return 0
+	}
+	if n <= 0 {
 		return 0
 	}
-	return int64(m.Limit.Context)
+	return n
 }
 
 // runCompactionAgent runs an agent against a sub-session for compaction.

From 8dba51f42430eb964141a0929c5a37fa90a79825 Mon Sep 17 00:00:00 2001
From: David Gageot <david.gageot@docker.com>
Date: Mon, 18 May 2026 12:00:13 +0200
Subject: [PATCH 2/2] refactor(runtime): centralise context-limit resolution

Self-review of the previous commit surfaced four issues:

  * compactIfNeeded carried an unused *modelsdev.Model parameter; drop it
    and let the call sites pass the resolved contextLimit only.
  * EmitStartupInfo and compactWithReason did their own catalogue-only
    lookup, so the sidebar's context-percent and the post-compaction
    TokenUsageEvent stayed inconsistent with the freshly-fixed compaction
    triggers in loop.go and session_compaction.go.
  * The provider_opts.context_size fallback was second-class. The user
    typed that number in their config, and DMR allocates exactly that
    much; treat it as authoritative when set, with the catalogue as
    fallback. This also makes the resolution uniform across providers
    rather than depending on whether the catalogue has the model.
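  * The priority order was implemented twice (in runStreamLoop and in
    compactionContextLimit, both catalogue-first), while EmitStartupInfo
    and compactWithReason stayed catalogue-only; duplicated resolution
    logic that can drift apart is a footgun.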

Extract resolveContextLimit on LocalRuntime as the single source of
truth. compactionContextLimit, runStreamLoop, EmitStartupInfo and
compactWithReason now route through it, so the sidebar, the proactive
trigger and the LLM compactor all plan against the same number.
---
 pkg/runtime/compaction_context_limit_test.go | 34 ++++++++++++++++----
 pkg/runtime/loop.go                          | 33 ++++++++-----------
 pkg/runtime/runtime.go                       | 10 ++----
 pkg/runtime/session_compaction.go            | 32 +++++++++++++-----
 4 files changed, 67 insertions(+), 42 deletions(-)

diff --git a/pkg/runtime/compaction_context_limit_test.go b/pkg/runtime/compaction_context_limit_test.go
index 15b14c6e0..60e478138 100644
--- a/pkg/runtime/compaction_context_limit_test.go
+++ b/pkg/runtime/compaction_context_limit_test.go
@@ -122,15 +122,18 @@ func TestCompactionContextLimit_FallsBackToProviderOpts(t *testing.T) {
 		"context limit must fall back to provider_opts.context_size when models.dev has no entry")
 }
 
-// TestCompactionContextLimit_PrefersModelsDev verifies that a present
-// models.dev limit wins over provider_opts.context_size. This keeps
-// existing behaviour intact for catalogued models.
-func TestCompactionContextLimit_PrefersModelsDev(t *testing.T) {
+// TestCompactionContextLimit_PrefersProviderOpts verifies that an explicit
+// user-supplied provider_opts.context_size is the authoritative limit, even
+// when the models.dev catalogue has its own entry. This is what the user is
+// asking for — DMR allocates exactly context_size tokens for the inference
+// context, and a user setting a smaller-than-catalogue value (cost / memory
+// tuning) wants compaction to respect that.
+func TestCompactionContextLimit_PrefersProviderOpts(t *testing.T) {
 	t.Parallel()
 
 	prov := &providerOptsProvider{
 		id:   "openai/gpt-5",
-		opts: map[string]any{"context_size": 1}, // user can't lie us into a tiny limit
+		opts: map[string]any{"context_size": 8192},
 	}
 	root := agent.New("root", "test", agent.WithModel(prov))
 	tm := team.New(team.WithAgents(root))
@@ -139,8 +142,25 @@ func TestCompactionContextLimit_PrefersModelsDev(t *testing.T) {
 	require.NoError(t, err)
 
 	got := rt.compactionContextLimit(t.Context(), root)
-	assert.Equal(t, int64(200_000), got,
-		"models.dev limit must take precedence over provider_opts.context_size")
+	assert.Equal(t, int64(8192), got,
+		"explicit provider_opts.context_size must take precedence over the catalogue")
+}
+
+// TestCompactionContextLimit_FallsBackToCatalogue verifies that when the
+// user has not supplied context_size, the runtime uses the models.dev
+// catalogue limit. This is the path most hosted-model users hit.
+func TestCompactionContextLimit_FallsBackToCatalogue(t *testing.T) {
+	t.Parallel()
+
+	prov := &providerOptsProvider{id: "openai/gpt-5"} // no opts
+	root := agent.New("root", "test", agent.WithModel(prov))
+	tm := team.New(team.WithAgents(root))
+
+	rt, err := NewLocalRuntime(tm, WithModelStore(mockModelStoreWithLimit{limit: 200_000}))
+	require.NoError(t, err)
+
+	got := rt.compactionContextLimit(t.Context(), root)
+	assert.Equal(t, int64(200_000), got)
 }
 
 // TestCompactionContextLimit_NoSourcesYieldsZero verifies the legacy
diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go
index 99b576f77..a028d8821 100644
--- a/pkg/runtime/loop.go
+++ b/pkg/runtime/loop.go
@@ -362,18 +362,15 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session,
 		if err != nil {
 			slog.DebugContext(ctx, "Failed to get model definition", "error", err)
 		}
-		// We can only compact if we know the limit. Prefer the
-		// models.dev catalogue; fall back to the user-supplied
-		// provider_opts.context_size so local models that aren't
-		// catalogued (e.g. DMR with HuggingFace GGUFs) still benefit
-		// from automatic compaction.
-		var contextLimit int64
-		if m != nil {
-			contextLimit = int64(m.Limit.Context)
-		}
-		if contextLimit <= 0 {
-			contextLimit = providerContextLimit(model)
-		}
+		// We can only compact if we know the context limit.
+		// resolveContextLimit prefers provider_opts.context_size when set
+		// (some providers — notably Docker Model Runner — use it to size
+		// the actual inference context), then falls back to the models.dev
+		// catalogue. That fallback repeats the model lookup above, but
+		// only when context_size isn't supplied; we keep the explicit call
+		// here because m is also threaded into [recordAssistantMessage] for
+		// per-message cost computation.
+		contextLimit := r.resolveContextLimit(ctx, model, modelID)
 		if contextLimit > 0 && r.sessionCompaction && compaction.ShouldCompact(sess.InputTokens, sess.OutputTokens, 0, contextLimit) {
 			r.compactWithReason(ctx, sess, "", compactionReasonThreshold, sink)
 		}
@@ -381,7 +378,7 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session,
 		// Drain steer messages queued while idle or before the first model call
 		// (covers idle-window and first-turn-miss races).
 		if drained, messageCountBeforeSteer := r.drainAndEmitSteered(ctx, sess, sink); drained {
-			r.compactIfNeeded(ctx, sess, a, m, contextLimit, messageCountBeforeSteer, sink)
+			r.compactIfNeeded(ctx, sess, a, contextLimit, messageCountBeforeSteer, sink)
 		}
 
 		// Everything from turn_start onwards is wrapped in a closure so a
@@ -646,7 +643,7 @@ func (r *LocalRuntime) runTurn(
 
 	// Drain steer messages that arrived during tool calls.
 	if drained, _ := r.drainAndEmitSteered(ctx, sess, events); drained {
-		r.compactIfNeeded(ctx, sess, a, m, contextLimit, messageCountBeforeTools, events)
+		r.compactIfNeeded(ctx, sess, a, contextLimit, messageCountBeforeTools, events)
 		endReason = turnEndReasonSteered
 		return turnContinue
 	}
@@ -657,7 +654,7 @@ func (r *LocalRuntime) runTurn(
 
 		// Re-check steer queue: closes the race between the mid-loop drain and this stop.
 		if drained, _ := r.drainAndEmitSteered(ctx, sess, events); drained {
-			r.compactIfNeeded(ctx, sess, a, m, contextLimit, messageCountBeforeTools, events)
+			r.compactIfNeeded(ctx, sess, a, contextLimit, messageCountBeforeTools, events)
 			endReason = turnEndReasonSteered
 			return turnContinue
 		}
@@ -672,7 +669,7 @@ func (r *LocalRuntime) runTurn(
 			userMsg := session.UserMessage(followUp.Content, followUp.MultiContent...)
 			sess.AddMessage(userMsg)
 			events.Emit(UserMessage(followUp.Content, sess.ID, followUp.MultiContent, len(sess.Messages)-1))
-			r.compactIfNeeded(ctx, sess, a, m, contextLimit, messageCountBeforeTools, events)
+			r.compactIfNeeded(ctx, sess, a, contextLimit, messageCountBeforeTools, events)
 			endReason = turnEndReasonContinue
 			return turnContinue // re-enter the loop for a new turn
 		}
@@ -681,7 +678,7 @@ func (r *LocalRuntime) runTurn(
 		return turnExit
 	}
 
-	r.compactIfNeeded(ctx, sess, a, m, contextLimit, messageCountBeforeTools, events)
+	r.compactIfNeeded(ctx, sess, a, contextLimit, messageCountBeforeTools, events)
 	endReason = turnEndReasonContinue
 	return turnContinue
 }
@@ -780,12 +777,10 @@ func (r *LocalRuntime) compactIfNeeded(
 	ctx context.Context,
 	sess *session.Session,
 	a *agent.Agent,
-	m *modelsdev.Model,
 	contextLimit int64,
 	messageCountBefore int,
 	events EventSink,
 ) {
-	_ = m // models.dev definition isn't required: contextLimit may have been derived from provider_opts.context_size when the model is missing from the catalogue.
 	if !r.sessionCompaction || contextLimit <= 0 {
 		return
 	}
diff --git a/pkg/runtime/runtime.go b/pkg/runtime/runtime.go
index 7e9fccdd8..2c77e6424 100644
--- a/pkg/runtime/runtime.go
+++ b/pkg/runtime/runtime.go
@@ -1029,10 +1029,7 @@ func (r *LocalRuntime) EmitStartupInfo(ctx context.Context, sess *session.Sessio
 	// sub-sessions won't emit their own events, so the parent must include
 	// their costs.
 	if sess != nil && (sess.InputTokens > 0 || sess.OutputTokens > 0) {
-		var contextLimit int64
-		if m, err := r.modelsStore.GetModel(ctx, modelID); err == nil && m != nil {
-			contextLimit = int64(m.Limit.Context)
-		}
+		contextLimit := r.resolveContextLimit(ctx, a.Model(ctx), modelID)
 		usage := SessionUsage(sess, contextLimit)
 		usage.Cost = sess.TotalCost()
 
@@ -1301,10 +1298,7 @@ func (r *LocalRuntime) compactWithReason(ctx context.Context, sess *session.Sess
 	// compaction: tokens drop to the summary size, context % drops, and
 	// cost increases by the summary generation cost.
 	modelID := r.getEffectiveModelID(a)
-	var contextLimit int64
-	if m, err := r.modelsStore.GetModel(ctx, modelID); err == nil && m != nil {
-		contextLimit = int64(m.Limit.Context)
-	}
+	contextLimit := r.resolveContextLimit(ctx, a.Model(ctx), modelID)
 	events.Emit(NewTokenUsageEvent(sess.ID, a.Name(), SessionUsage(sess, contextLimit)))
 }
 
diff --git a/pkg/runtime/session_compaction.go b/pkg/runtime/session_compaction.go
index 4be0554f4..6132c2d5d 100644
--- a/pkg/runtime/session_compaction.go
+++ b/pkg/runtime/session_compaction.go
@@ -12,6 +12,7 @@ import (
 	"github.com/docker/docker-agent/pkg/hooks"
 	"github.com/docker/docker-agent/pkg/model/provider"
 	"github.com/docker/docker-agent/pkg/model/provider/options"
+	"github.com/docker/docker-agent/pkg/modelsdev"
 	"github.com/docker/docker-agent/pkg/runtime/compactor"
 	"github.com/docker/docker-agent/pkg/session"
 	"github.com/docker/docker-agent/pkg/team"
@@ -165,12 +166,9 @@ func summaryFromHook(sess *session.Session, a *agent.Agent, pre *hooks.Result) *
 // hook may supply its own summary and never need the model definition.
 // The LLM strategy itself enforces ContextLimit > 0.
 //
-// When the modelsdev definition is unavailable (e.g. a Docker Model
-// Runner model that isn't catalogued, like a HuggingFace GGUF), the
-// limit falls back to the user-supplied [provider_opts.context_size].
-// This mirrors what DMR itself uses to size the inference context, so
-// compaction triggers (proactive 90% threshold and post-overflow
-// recovery) work for local models without a models.dev entry.
+// See [LocalRuntime.resolveContextLimit] for the resolution order; we
+// pass the cloned summary-call provider so its provider_opts (which
+// match the underlying model) are considered.
 func (r *LocalRuntime) compactionContextLimit(ctx context.Context, a *agent.Agent) int64 {
 	if a == nil || a.Model(ctx) == nil {
 		return 0
@@ -179,11 +177,29 @@ func (r *LocalRuntime) compactionContextLimit(ctx context.Context, a *agent.Agen
 		options.WithStructuredOutput(nil),
 		options.WithMaxTokens(compactor.MaxSummaryTokens),
 	)
-	m, err := r.modelsStore.GetModel(ctx, summaryModel.ID())
+	return r.resolveContextLimit(ctx, summaryModel, summaryModel.ID())
+}
+
+// resolveContextLimit resolves the effective context window size for a
+// model. Resolution order:
+//
+//  1. The user-supplied [provider_opts.context_size], when set, is
+//     authoritative. Some providers (notably Docker Model Runner) use
+//     it to size the actual inference context, so we plan against the
+//     same number the engine will enforce. This also makes compaction
+//     work for local models that aren't catalogued in models.dev (e.g.
+//     a HuggingFace GGUF).
+//  2. Otherwise, the models.dev catalogue limit looked up by id.
+//  3. Otherwise, 0 (caller treats this as "can't compact").
+func (r *LocalRuntime) resolveContextLimit(ctx context.Context, p provider.Provider, id modelsdev.ID) int64 {
+	if n := providerContextLimit(p); n > 0 {
+		return n
+	}
+	m, err := r.modelsStore.GetModel(ctx, id)
 	if err == nil && m != nil && m.Limit.Context > 0 {
 		return int64(m.Limit.Context)
 	}
-	return providerContextLimit(summaryModel)
+	return 0
 }
 
 // providerContextLimit reads [provider_opts.context_size] from a