# prometheus_alerts.yml
# Prometheus alerting rules for the gULP backend.
# A rule becomes firing once its `expr` has kept returning results for the
# whole `for` duration; `severity` is attached to the alert as a label.
groups:
  - name: gulp-backend
    rules:
      # Scrape health: the job="gulp" target reports up == 0.
      - alert: GulpTargetDown
        expr: up{job="gulp"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "gULP target is down"
          description: "Prometheus cannot scrape the gULP /metrics endpoint."

      # Per task_type: the largest oldest-queued age is above 300 seconds.
      - alert: GulpTaskQueueOldestQueuedTooOld
        expr: max by (task_type) (gulp_redis_task_oldest_queued_age_seconds) > 300
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "gULP task queue is not draining"
          description: "The oldest queued {{ $labels.task_type }} task is older than 5 minutes."

      # Per task_type: the largest oldest-pending age is above 900 seconds.
      - alert: GulpTaskPendingTooOld
        expr: max by (task_type) (gulp_redis_task_oldest_pending_age_seconds) > 900
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "gULP pending task is too old"
          description: "A {{ $labels.task_type }} task has been pending for more than 15 minutes."

      # Per task_type: non-zero 5m rate of action="failure", outcome="failed"
      # task transitions.
      - alert: GulpTaskFailures
        expr: sum by (task_type) (rate(gulp_redis_task_transition_total{action="failure",outcome="failed"}[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "gULP task failures"
          description: "{{ $labels.task_type }} task handlers are failing."

      # Per req_type: the stale-ongoing request stats metric is above zero.
      - alert: GulpRequestStatsStaleOngoing
        expr: max by (req_type) (gulp_request_stats_stale_ongoing) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "gULP request stats are stale"
          description: "{{ $labels.req_type }} request stats have been ongoing without updates for at least 15 minutes."

      # Non-zero 5m rejection rate, broken down by endpoint, reason,
      # task_type and scope so each combination alerts separately.
      - alert: GulpApiRequestRejected
        expr: sum by (endpoint, reason, task_type, scope) (rate(gulp_api_request_rejected_total[5m])) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "gULP API admission rejects requests"
          description: "{{ $labels.endpoint }} is rejecting {{ $labels.task_type }} work due to {{ $labels.reason }} at scope {{ $labels.scope }}."

      # Per server_id and socket_type: non-zero 5m rate of enqueue timeouts.
      - alert: GulpWebsocketEnqueueTimeouts
        expr: sum by (server_id, socket_type) (rate(gulp_ws_enqueue_timeout_total[5m])) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "gULP websocket clients are not draining"
          description: "Websocket enqueue timeouts are occurring on {{ $labels.server_id }} for {{ $labels.socket_type }} sockets."

      # Per outcome: non-zero 5m rate of pointer resolutions whose outcome
      # matches "missing" or "error".
      - alert: GulpPayloadPointerResolutionFailures
        expr: sum by (outcome) (rate(gulp_ws_payload_pointer_resolve_total{outcome=~"missing|error"}[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "gULP pub/sub payload pointer resolution is failing"
          description: "Large payload pointer resolution has {{ $labels.outcome }} outcomes."

      # Not aggregated: evaluated separately for every series of the counter.
      - alert: GulpRedisSubscriberErrors
        expr: rate(gulp_redis_subscriber_errors_total[5m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "gULP Redis subscriber errors"
          description: "Redis pub/sub subscriber errors are being reported by gULP."

      # Not aggregated: evaluated separately for every series of the counter.
      - alert: GulpRedisPubSubReconnects
        expr: rate(gulp_redis_pubsub_reconnect_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "gULP Redis pub/sub reconnects"
          description: "Redis pub/sub reconnects are occurring; check Redis connectivity and subscriber logs."