local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet';

// Prometheus alert rules for Pinecone, exposed through `new(this)`.
{
  // Builds the `prometheusAlerts` object (a single rule group) from `this.config`.
  //
  // Fields read from `this.config`:
  //   uid                                        - prefix of the rule group name
  //   filteringSelector                          - label matchers injected into every metric selector
  //   instanceLabels                             - first and last entries are used in alert descriptions
  //   alertsQueryLatencySimpleWarningMs / ...CriticalMs
  //   alertsUpsertLatencyWarningMs / ...CriticalMs
  //   alertsUnitBurnDownBaselineIncreaseWarning  - percent of the previous 30m rate (compared as value / 100)
  //   alertsUnitBurnDownBudgetUsageWarning       - percent of budget, projected from 1h of usage
  new(this): {
    // The last instance label is reported as the "index" in descriptions,
    // the first one as the entity the alert fired on.
    local instanceLabel = xtd.array.slice(this.config.instanceLabels, -1)[0],
    local firstInstanceLabel = this.config.instanceLabels[0],

    // Builds one average-latency alert rule.
    //
    //   alertName       - value of the rule's `alert` field
    //   opTitle         - capitalised operation name ('Query', 'Upsert'); its lower-case
    //                     form selects the pinecone_db_op_<op>_* metrics
    //   severity        - 'warning' or 'critical'; 'critical' prefixes the description with 'CRITICAL: '
    //   window          - PromQL range used for both rate() calls
    //   thresholdMs     - threshold in milliseconds (divided by 1000 inside the expression)
    //   thresholdSuffix - text appended after '<threshold>ms' in the description
    //
    // NOTE(review): clamp_min(..., 1) floors the operation rate at 1 op/s, so for series
    // below that rate the quotient is the duration rate rather than a per-operation
    // average - confirm this damping is intended.
    // NOTE(review): dividing the threshold by 1000 assumes *_duration_sum is reported in
    // seconds - confirm against the exporter's metric units.
    local latencyAlert(alertName, opTitle, severity, window, thresholdMs, thresholdSuffix='') = {
      local fmt = {
        opTitle: opTitle,
        op: std.asciiLower(opTitle),
        severity: severity,
        severityPrefix: if severity == 'critical' then 'CRITICAL: ' else '',
        window: window,
        thresholdMs: thresholdMs,
        thresholdSuffix: thresholdSuffix,
        filteringSelector: this.config.filteringSelector,
        firstInstanceLabel: firstInstanceLabel,
        instanceLabel: instanceLabel,
      },
      alert: alertName,
      expr: 'rate(pinecone_db_op_%(op)s_duration_sum{%(filteringSelector)s}[%(window)s]) / clamp_min(rate(pinecone_db_op_%(op)s_count{%(filteringSelector)s}[%(window)s]), 1) > (%(thresholdMs)s / 1000)' % fmt,
      'for': '5m',
      keep_firing_for: '5m',
      labels: {
        severity: severity,
      },
      annotations: {
        summary: '%(opTitle)s latency exceeds %(severity)s thresholds, indicating performance degradation in %(op)s operations.' % fmt,
        // '%%' yields a literal '%' so the Go template receives printf "%.3f".
        description: '%(opTitle)s latency on {{ $labels.%(firstInstanceLabel)s }} (index: {{ $labels.%(instanceLabel)s }}) is {{ printf "%%.3f" $value }}s. %(severityPrefix)sThis exceeds the %(severity)s threshold: > %(thresholdMs)sms%(thresholdSuffix)s.' % fmt,
      },
    },

    prometheusAlerts: {
      groups: [
        {
          name: this.config.uid + '-alerts',
          rules: [
            // Query latency: 5m rate window.
            latencyAlert(
              'PineconeHighQueryLatencyWarning',
              'Query',
              'warning',
              '5m',
              this.config.alertsQueryLatencySimpleWarningMs,
            ),
            latencyAlert(
              'PineconeHighQueryLatencyCritical',
              'Query',
              'critical',
              '5m',
              this.config.alertsQueryLatencySimpleCriticalMs,
            ),
            // Upsert latency: 15m rate window, described as "sustained".
            latencyAlert(
              'PineconeHighUpsertLatencyWarning',
              'Upsert',
              'warning',
              '15m',
              this.config.alertsUpsertLatencyWarningMs,
              ' sustained',
            ),
            latencyAlert(
              'PineconeHighUpsertLatencyCritical',
              'Upsert',
              'critical',
              '15m',
              this.config.alertsUpsertLatencyCriticalMs,
              ' sustained',
            ),
            {
              // Fires when any of four conditions holds:
              //   1-2. the read / write unit rate over the last 30m, divided by the rate of
              //        the preceding 30m (floored at 1), exceeds the baseline threshold;
              //   3-4. the last hour's read / write unit increase, multiplied by 24, exceeds
              //        the budget-usage percentage of the corresponding *_budget metric.
              // NOTE(review): read units use the `_count` suffix while write units use
              // `_total` - confirm both names against the exporter.
              alert: 'PineconeUnitBurnDownWarning',
              expr: |||
                (
                  rate(pinecone_db_read_unit_count{%(filteringSelector)s}[30m])
                  / clamp_min(rate(pinecone_db_read_unit_count{%(filteringSelector)s}[30m] offset 30m), 1)
                  > (%(unitBurnDownBaselineIncreaseWarning)s / 100)
                )
                OR
                (
                  rate(pinecone_db_write_unit_total{%(filteringSelector)s}[30m])
                  / clamp_min(rate(pinecone_db_write_unit_total{%(filteringSelector)s}[30m] offset 30m), 1)
                  > (%(unitBurnDownBaselineIncreaseWarning)s / 100)
                )
                OR
                (
                  increase(pinecone_db_read_unit_count{%(filteringSelector)s}[1h]) > 0
                  AND
                  100 * 24 * increase(pinecone_db_read_unit_count{%(filteringSelector)s}[1h]) / clamp_min(pinecone_db_read_unit_budget{%(filteringSelector)s}, 1) > %(unitBurnDownBudgetUsageWarning)s
                )
                OR
                (
                  increase(pinecone_db_write_unit_total{%(filteringSelector)s}[1h]) > 0
                  AND
                  100 * 24 * increase(pinecone_db_write_unit_total{%(filteringSelector)s}[1h]) / clamp_min(pinecone_db_write_unit_budget{%(filteringSelector)s}, 1) > %(unitBurnDownBudgetUsageWarning)s
                )
              ||| % this.config {
                unitBurnDownBaselineIncreaseWarning: this.config.alertsUnitBurnDownBaselineIncreaseWarning,
                unitBurnDownBudgetUsageWarning: this.config.alertsUnitBurnDownBudgetUsageWarning,
              },
              'for': '5m',
              keep_firing_for: '10m',
              labels: {
                severity: 'warning',
              },
              annotations: {
                summary: 'RU/WU usage increasing rapidly or nearing allocated limits, causing potential throttling or cost spikes.',
                // The baseline threshold is configured as a percentage of the previous rate
                // (e.g. 150), so subtracting 100 reports it as "percent above baseline".
                description: 'Unit consumption on {{ $labels.%s }} (index: {{ $labels.%s }}) is high. This exceeds the warning threshold: either RU or WU rate > %s%% above 30-minute baseline or sustained usage over 1h > %s%% of allocated budget.' % [
                  firstInstanceLabel,
                  instanceLabel,
                  (this.config.alertsUnitBurnDownBaselineIncreaseWarning - 100),
                  this.config.alertsUnitBurnDownBudgetUsageWarning,
                ],
              },
            },
          ],
        },
      ],
    },
  },
}