Skip to content

Commit bca6826

Browse files
Ingester: fix wrong conversion of global into local limits (grafana#4238)
* Squash 7 commits Ingester: fix wrong conversion of global into local limits Upgrade CHANGELOG.md and add TODO in go.mod as a reminder to set the right version of dskit Implementing review findings Implementing review findings Implementing review findings Implementing review findings Upgraded to the latest dskit * Implementing review findings * Fixing a failing test * Improve precision of local limit calculation
1 parent 019ff2a commit bca6826

File tree

8 files changed: +223 additions, −70 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ Querying with using `{__mimir_storage__="ephemeral"}` selector no longer works.
5151
* [BUGFIX] Fix JSON and YAML marshalling of `ephemeral_series_matchers` field in `/runtime_config`. #4091
5252
* [BUGFIX] Querier: track canceled requests with status code `499` in the metrics instead of `503` or `422`. #4099
5353
* [BUGFIX] Ingester: compact out-of-order data during `/ingester/flush` or when TSDB is idle. #4180
54+
* [BUGFIX] Ingester: conversion of global limits `max-series-per-user`, `max-series-per-metric`, `max-metadata-per-user` and `max-metadata-per-metric` into corresponding local limits now takes into account the number of ingesters in each zone. #4238
5455

5556
### Mixin
5657

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ require (
1616
github.com/golang/snappy v0.0.4
1717
github.com/google/gopacket v1.1.19
1818
github.com/gorilla/mux v1.8.0
19-
github.com/grafana/dskit v0.0.0-20230221092330-02c947faa1a0
19+
github.com/grafana/dskit v0.0.0-20230221142742-fcb3d68da3ed
2020
github.com/grafana/e2e v0.1.1-0.20221115065638-fe4609fcbc71
2121
github.com/hashicorp/golang-lru v0.6.0
2222
github.com/json-iterator/go v1.1.12

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -495,8 +495,8 @@ github.com/gosimple/slug v1.1.1 h1:fRu/digW+NMwBIP+RmviTK97Ho/bEj/C9swrCspN3D4=
495495
github.com/gosimple/slug v1.1.1/go.mod h1:ER78kgg1Mv0NQGlXiDe57DpCyfbNywXXZ9mIorhxAf0=
496496
github.com/grafana-tools/sdk v0.0.0-20211220201350-966b3088eec9 h1:LQAhgcUPnzdjU/OjCJaLlPQI7NmQCRlfjMPSA1VegvA=
497497
github.com/grafana-tools/sdk v0.0.0-20211220201350-966b3088eec9/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4=
498-
github.com/grafana/dskit v0.0.0-20230221092330-02c947faa1a0 h1:zHD4rspd2wyAlVnkMe/FXilYhJEW2HewUfiB+fRG7Ok=
499-
github.com/grafana/dskit v0.0.0-20230221092330-02c947faa1a0/go.mod h1:cZg7Iqx0LcH7n2WU7SZzMWX6Hu3eXSFFS/EmmaVgCOY=
498+
github.com/grafana/dskit v0.0.0-20230221142742-fcb3d68da3ed h1:vKJzoZMnG0bfvmZXXulZR6J7HsE0MRuCa/X86B4mRt8=
499+
github.com/grafana/dskit v0.0.0-20230221142742-fcb3d68da3ed/go.mod h1:cZg7Iqx0LcH7n2WU7SZzMWX6Hu3eXSFFS/EmmaVgCOY=
500500
github.com/grafana/e2e v0.1.1-0.20221115065638-fe4609fcbc71 h1:sAMF3CGAtlWLimTyvsO46Slwu1p6Fm5EMB64XzG5btI=
501501
github.com/grafana/e2e v0.1.1-0.20221115065638-fe4609fcbc71/go.mod h1:3UsooRp7yW5/NJQBlXcTsAHOoykEhNUYXkQ3r6ehEEY=
502502
github.com/grafana/gomemcache v0.0.0-20230221082510-6cde04bf2270 h1:cj3uiNKskh+/7QQOr3Lzdf40hmtpdlCRlYVdsV0xBWA=

pkg/ingester/limiter.go

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ var (
2828
// RingCount is the interface exposed by a ring implementation which allows
2929
// to count members
3030
type RingCount interface {
31-
HealthyInstancesCount() int
31+
InstancesCount() int
32+
InstancesInZoneCount() int
3233
ZonesCount() int
3334
}
3435

@@ -184,34 +185,43 @@ func (l *Limiter) convertGlobalToLocalLimit(userID string, globalLimit int) int
184185
return 0
185186
}
186187

187-
// Given we don't need a super accurate count (ie. when the ingesters
188-
// topology changes) and we prefer to always be in favor of the tenant,
189-
// we can use a per-ingester limit equal to:
190-
// (global limit / number of ingesters) * replication factor
191-
numIngesters := l.ring.HealthyInstancesCount()
192-
193-
// May happen because the number of ingesters is asynchronously updated.
194-
// If happens, we just temporarily ignore the global limit.
195-
if numIngesters == 0 {
196-
return 0
188+
zonesCount := l.getZonesCount()
189+
var ingestersInZoneCount int
190+
if zonesCount > 1 {
191+
// In this case zone-aware replication is enabled, and ingestersInZoneCount is initially set to
192+
// the total number of ingesters in the corresponding zone
193+
ingestersInZoneCount = l.ring.InstancesInZoneCount()
194+
} else {
195+
// In this case zone-aware replication is disabled, and ingestersInZoneCount is initially set to
196+
// the total number of ingesters
197+
ingestersInZoneCount = l.ring.InstancesCount()
198+
}
199+
shardSize := l.getShardSize(userID)
200+
// If shuffle sharding is enabled and the total number of ingesters in the zone is greater than the
201+
// expected number of ingesters per sharded zone, then we should honor the latter because series/metadata
202+
// cannot be written to more ingesters than that.
203+
if shardSize > 0 {
204+
ingestersInZoneCount = util_math.Min(ingestersInZoneCount, util.ShuffleShardExpectedInstancesPerZone(shardSize, zonesCount))
197205
}
198206

199-
// If the number of available ingesters is greater than the tenant's shard
200-
// size, then we should honor the shard size because series/metadata won't
201-
// be written to more ingesters than it.
202-
if shardSize := l.getShardSize(userID); shardSize > 0 {
203-
// We use Min() to protect from the case the expected shard size is > available ingesters.
204-
numIngesters = util_math.Min(numIngesters, util.ShuffleShardExpectedInstances(shardSize, l.getNumZones()))
207+
// This may happen, for example when the total number of ingesters is asynchronously updated, or
208+
// when zone-aware replication is enabled but ingesters in a zone have been scaled down.
209+
// In those cases we ignore the global limit.
210+
if ingestersInZoneCount == 0 {
211+
return 0
205212
}
206213

207-
return int((float64(globalLimit) / float64(numIngesters)) * float64(l.replicationFactor))
214+
// Global limit is equally distributed among all the active zones.
215+
// The portion of global limit related to each zone is then equally distributed
216+
// among all the ingesters belonging to that zone.
217+
return int((float64(globalLimit*l.replicationFactor) / float64(zonesCount)) / float64(ingestersInZoneCount))
208218
}
209219

210220
func (l *Limiter) getShardSize(userID string) int {
211221
return l.limits.IngestionTenantShardSize(userID)
212222
}
213223

214-
func (l *Limiter) getNumZones() int {
224+
func (l *Limiter) getZonesCount() int {
215225
if l.zoneAwarenessEnabled {
216226
return util_math.Max(l.ring.ZonesCount(), 1)
217227
}

0 commit comments

Comments (0)