metrics.sr.ht/service_rules.yml

25 lines
990 B
YAML

# vim: tw=2 sw=2 :
groups:
  # Service-level alerts built on the request_time_count and
  # srht_webhooks_queue_length metrics.
  - name: service
    rules:
      # Fires when, for a single series, more than 25% of the request rate
      # over the last 15 minutes carried status="500", sustained for 5 minutes.
      # `ignoring(status)` lets the status="500" series pair up with the
      # totals, which no longer carry a status label after
      # `sum without(status)`.
      # The trailing `and (...)` clause suppresses the alert unless the total
      # request rate over the last hour exceeds 1/60 per second (one request
      # per minute), so near-idle series do not alert on a few failures.
      # NOTE(review): the summary reads $labels.route, so the series are
      # presumably labelled per route and instance -- confirm against the
      # exporter.
      - alert: High rate of 500 errors on specific route
        expr: >-
          rate(request_time_count{status="500"}[15m])
          / ignoring(status)
          sum without(status) (rate(request_time_count[15m]))
          > 0.25
          and
          (sum without (status) (rate(request_time_count[1h])) > 1/60)
        for: 5m
        labels:
          severity: important
        annotations:
          summary: "{{ $labels.instance }} has a high rate of 500 errors on route {{ $labels.route }}"

      # Fires when, aggregated per instance, more than 25% of the request
      # rate over the last 15 minutes carried status="500", sustained for
      # 5 minutes.
      # NOTE(review): unlike the per-route rule above, this one has no
      # minimum-traffic guard; confirm that is intentional for low-traffic
      # instances.
      - alert: High rate of 500 errors on an instance
        expr: >-
          sum by(instance) (rate(request_time_count{status="500"}[15m]))
          / sum by(instance) (rate(request_time_count[15m]))
          > 0.25
        for: 5m
        labels:
          severity: urgent
        annotations:
          summary: "{{ $labels.instance }} has a high rate of 500 errors"

      # Fires when srht_webhooks_queue_length stays above 5 for 5 minutes.
      - alert: Webhook queue queued up
        expr: srht_webhooks_queue_length > 5
        for: 5m
        labels:
          severity: important
        annotations:
          summary: "{{ $labels.instance }} webhooks have queued up"