Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
[receiver/prometheusremotewritereceiver] Fix silent data loss on consumer failure

The receiver was sending HTTP 204 No Content before calling ConsumeMetrics(),
so if the consumer failed, clients incorrectly thought data was delivered.
This violates the Prometheus Remote Write spec which states receivers MUST NOT
return 2xx if data was not successfully written.

Changes:
- Move WriteHeader(204) to after ConsumeMetrics() succeeds
- Return 400 Bad Request for permanent consumer errors
- Return 500 Internal Server Error for retryable errors
- Add tests for consumer error handling

Signed-off-by: Arve Knudsen <arve.knudsen@gmail.com>
  • Loading branch information
aknuds1 committed Jan 2, 2026
commit ec15cd2fdfa974a6c5cef001279d9b614f85a686
31 changes: 31 additions & 0 deletions .chloggen/prometheusremotewrite-fix-silent-data-loss.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog)
component: receiver/prometheusremotewrite

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fix silent data loss when consumer fails by returning appropriate HTTP error codes instead of 204 No Content.

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [45151]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
  The receiver was sending HTTP 204 No Content before calling ConsumeMetrics(),
  causing clients to believe data was successfully delivered even when the consumer failed.
  Now returns 400 Bad Request for permanent errors and 500 Internal Server Error for retryable errors,
  as per the Prometheus Remote Write 2.0 specification.

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [user]
2 changes: 1 addition & 1 deletion receiver/prometheusremotewritereceiver/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require (
go.opentelemetry.io/collector/config/confighttp v0.142.1-0.20251223191316-a9aaa99a1214
go.opentelemetry.io/collector/confmap v1.48.1-0.20251223191316-a9aaa99a1214
go.opentelemetry.io/collector/consumer v1.48.1-0.20251223191316-a9aaa99a1214
go.opentelemetry.io/collector/consumer/consumererror v0.142.1-0.20251223191316-a9aaa99a1214
go.opentelemetry.io/collector/consumer/consumertest v0.142.1-0.20251223191316-a9aaa99a1214
go.opentelemetry.io/collector/pdata v1.48.1-0.20251223191316-a9aaa99a1214
go.opentelemetry.io/collector/receiver v1.48.1-0.20251223191316-a9aaa99a1214
Expand Down Expand Up @@ -103,7 +104,6 @@ require (
go.opentelemetry.io/collector/config/configoptional v1.48.1-0.20251223191316-a9aaa99a1214 // indirect
go.opentelemetry.io/collector/config/configtls v1.48.1-0.20251223191316-a9aaa99a1214 // indirect
go.opentelemetry.io/collector/confmap/xconfmap v0.142.1-0.20251223191316-a9aaa99a1214 // indirect
go.opentelemetry.io/collector/consumer/consumererror v0.142.1-0.20251223191316-a9aaa99a1214 // indirect
go.opentelemetry.io/collector/consumer/xconsumer v0.142.1-0.20251223191316-a9aaa99a1214 // indirect
go.opentelemetry.io/collector/extension/extensionauth v1.48.1-0.20251223191316-a9aaa99a1214 // indirect
go.opentelemetry.io/collector/extension/extensionmiddleware v0.142.1-0.20251223191316-a9aaa99a1214 // indirect
Expand Down
17 changes: 13 additions & 4 deletions receiver/prometheusremotewritereceiver/receiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/component/componentstatus"
"go.opentelemetry.io/collector/consumer"
"go.opentelemetry.io/collector/consumer/consumererror"
"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/pmetric"
"go.opentelemetry.io/collector/receiver"
Expand Down Expand Up @@ -193,18 +194,26 @@ func (prw *prometheusRemoteWriteReceiver) handlePRW(w http.ResponseWriter, req *
return
}

w.WriteHeader(http.StatusNoContent)

// Return if metric count is 0.
// Return early if metric count is 0.
if m.MetricCount() == 0 {
w.WriteHeader(http.StatusNoContent)
return
}

obsrecvCtx := prw.obsrecv.StartMetricsOp(req.Context())
err = prw.nextConsumer.ConsumeMetrics(req.Context(), m)
prw.obsrecv.EndMetricsOp(obsrecvCtx, "prometheusremotewritereceiver", m.ResourceMetrics().Len(), err)
if err != nil {
prw.settings.Logger.Error("Error consuming metrics", zapcore.Field{Key: "error", Type: zapcore.ErrorType, Interface: err})
if consumererror.IsPermanent(err) {
http.Error(w, err.Error(), http.StatusBadRequest)
} else {
http.Error(w, err.Error(), http.StatusInternalServerError)
}
return
}
prw.obsrecv.EndMetricsOp(obsrecvCtx, "prometheusremotewritereceiver", m.ResourceMetrics().Len(), err)

w.WriteHeader(http.StatusNoContent)
}

// parseProto parses the content-type header and returns the version of the remote-write protocol.
Expand Down
100 changes: 100 additions & 0 deletions receiver/prometheusremotewritereceiver/receiver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package prometheusremotewritereceiver // import "github.com/open-telemetry/opent
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"math"
Expand All @@ -23,8 +24,10 @@ import (
writev2 "github.com/prometheus/prometheus/prompb/io/prometheus/write/v2"
"github.com/prometheus/prometheus/storage/remote"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/consumer"
"go.opentelemetry.io/collector/consumer/consumererror"
"go.opentelemetry.io/collector/consumer/consumertest"
"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/pmetric"
Expand Down Expand Up @@ -2030,3 +2033,100 @@ func TestConcurrentRequestsforSameResourceAttributes(t *testing.T) {
}
}
}

// setupMetricsReceiverWithConsumer builds a prometheusRemoteWriteReceiver from the
// factory's default configuration, forwarding to the supplied nextConsumer.
//
// The receiver's obsrecv is replaced with an ObsReport created for receiver ID
// "test" over the "http" transport, and a cleanup hook is registered that purges
// the receiver's rmCache when the test finishes. Any construction error fails the
// calling test immediately.
func setupMetricsReceiverWithConsumer(t *testing.T, nextConsumer consumer.Metrics) *prometheusRemoteWriteReceiver {
	t.Helper()

	f := NewFactory()
	defaultCfg := f.CreateDefaultConfig()

	created, err := f.CreateMetrics(
		t.Context(),
		receivertest.NewNopSettings(metadata.Type),
		defaultCfg,
		nextConsumer,
	)
	require.NoError(t, err)
	require.NotNil(t, created, "metrics receiver creation failed")

	reportSettings := receiverhelper.ObsReportSettings{
		ReceiverID:             component.MustNewID("test"),
		Transport:              "http",
		ReceiverCreateSettings: receivertest.NewNopSettings(metadata.Type),
	}
	report, err := receiverhelper.NewObsReport(reportSettings)
	require.NoError(t, err)

	// The factory returns the generic receiver interface; tests need the
	// concrete type to call handlePRW and to reach unexported fields.
	rcv := created.(*prometheusRemoteWriteReceiver)
	rcv.obsrecv = report

	t.Cleanup(func() {
		rcv.rmCache.Purge()
	})

	return rcv
}

// TestHandlePRWConsumerResponse checks that handlePRW reports the outcome of the
// downstream consumer through the HTTP status code:
//
//   - consumer succeeds        -> 204 No Content, and the metrics reach the sink
//   - consumer fails (generic) -> 500 Internal Server Error, error text in body
//   - consumer fails permanent -> 400 Bad Request, error text in body
func TestHandlePRWConsumerResponse(t *testing.T) {
	// Create a valid request with metrics.
	request := &writev2.Request{
		Symbols: []string{"", "__name__", "test_metric", "job", "test-job", "instance", "test-instance"},
		Timeseries: []writev2.TimeSeries{
			{
				Metadata:   writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
				LabelsRefs: []uint32{1, 2, 3, 4, 5, 6},
				Samples:    []writev2.Sample{{Value: 1, Timestamp: 1}},
			},
		},
	}

	pBuf := proto.NewBuffer(nil)
	err := pBuf.Marshal(request)
	require.NoError(t, err)

	// Send raw protobuf body - in production the confighttp middleware decompresses
	// but in tests we call handlePRW directly without middleware.
	rawBody := pBuf.Bytes()

	// post sends rawBody to a fresh receiver backed by next and returns the
	// response status code together with the fully-read response body. The
	// body is always closed here so no subtest leaves a response open.
	post := func(t *testing.T, next consumer.Metrics) (int, string) {
		t.Helper()

		prwReceiver := setupMetricsReceiverWithConsumer(t, next)

		req := httptest.NewRequest(http.MethodPost, "/api/v1/write", bytes.NewReader(rawBody))
		req.Header.Set("Content-Type", fmt.Sprintf("application/x-protobuf;proto=%s", remoteapi.WriteV2MessageType))

		w := httptest.NewRecorder()
		prwReceiver.handlePRW(w, req)

		resp := w.Result()
		body, readErr := io.ReadAll(resp.Body)
		require.NoError(t, resp.Body.Close())
		require.NoError(t, readErr)

		return resp.StatusCode, string(body)
	}

	t.Run("success returns 204", func(t *testing.T) {
		sink := &consumertest.MetricsSink{}

		status, _ := post(t, sink)

		assert.Equal(t, http.StatusNoContent, status)
		assert.Len(t, sink.AllMetrics(), 1)
	})

	// Error cases differ only in the consumer error and the expected status.
	failures := []struct {
		name       string
		err        error
		wantStatus int
		wantBody   string
	}{
		{
			name:       "retryable error returns 500",
			err:        errors.New("temporary failure"),
			wantStatus: http.StatusInternalServerError,
			wantBody:   "temporary failure",
		},
		{
			name:       "permanent error returns 400",
			err:        consumererror.NewPermanent(errors.New("permanent failure")),
			wantStatus: http.StatusBadRequest,
			wantBody:   "permanent failure",
		},
	}

	for _, tc := range failures {
		t.Run(tc.name, func(t *testing.T) {
			status, body := post(t, consumertest.NewErr(tc.err))

			assert.Equal(t, tc.wantStatus, status)
			assert.Contains(t, body, tc.wantBody)
		})
	}
}