metrics.sr.ht/node_rules.yml

119 lines
4.0 KiB
YAML

# vim: ts=2 sw=2 :
# Alerting rules for the `node` group; each entry under `rules` defines one
# alert with its expression, optional hold time (`for`), labels and annotations.
groups:
- name: node
rules:
# Raised once the `up` series of a target has been 0 for two minutes.
- alert: Instance down
  expr: up == 0
  for: 2m
  labels:
    severity: urgent
  annotations:
    summary: "Instance {{ $labels.instance }} is down"
# Raised while less than 60 seconds separate the current time from
# node_boot_time_seconds. No `for` clause is set on this rule.
- alert: Instance rebooted
  expr: time() - node_boot_time_seconds < 60
  labels:
    severity: interesting
  annotations:
    summary: "Instance {{ $labels.instance }} was rebooted"
# Raised when node_filesystem_readonly is non-zero for the `/` or `/var`
# mountpoint. No `for` clause is set on this rule.
- alert: Read-only filesystem
  expr: node_filesystem_readonly{mountpoint=~"/|/var"} != 0
  labels:
    severity: urgent
  annotations:
    summary: "Instance {{ $labels.instance }} read-only filesystem on {{ $labels.mountpoint }}"
# Raised when the used fraction, (size - avail) / size, of `/` or `/var`
# exceeds 0.90. The 0.98 threshold is covered by the separate "Full disk" rule.
- alert: High disk usage
  expr: >
    (node_filesystem_size_bytes{mountpoint=~"/|/var"}
    - node_filesystem_avail_bytes{mountpoint=~"/|/var"})
    / node_filesystem_size_bytes{mountpoint=~"/|/var"}
    > 0.90
  labels:
    severity: important
  annotations:
    summary: "Instance {{ $labels.instance }} has high disk usage on {{ $labels.mountpoint }}"
# Raised when the used fraction, (size - avail) / size, of `/` or `/var`
# exceeds 0.98. Same expression as "High disk usage" with a stricter threshold
# and a higher severity.
- alert: Full disk
  expr: >
    (node_filesystem_size_bytes{mountpoint=~"/|/var"}
    - node_filesystem_avail_bytes{mountpoint=~"/|/var"})
    / node_filesystem_size_bytes{mountpoint=~"/|/var"}
    > 0.98
  labels:
    severity: urgent
  annotations:
    summary: "Instance {{ $labels.instance }} has a full disk on {{ $labels.mountpoint }}"
# Raised when the used fraction, (size - avail) / size, of the `/tmp`
# mountpoint stays above 0.80 for five minutes. The filesystem is selected by
# mountpoint only; the expression does not filter on filesystem type.
- alert: High tmpfs usage
  expr: >
    (node_filesystem_size_bytes{mountpoint=~"/tmp"}
    - node_filesystem_avail_bytes{mountpoint=~"/tmp"})
    / node_filesystem_size_bytes{mountpoint=~"/tmp"}
    > 0.80
  for: 5m
  labels:
    severity: urgent
  annotations:
    # The summary previously read "has tmpfs usage", dropping the word "high".
    summary: "Instance {{ $labels.instance }} has high tmpfs usage"
# Raised when the per-instance average of the 2-minute rate of
# node_cpu_seconds_total{mode="user"} stays above 0.75 for five minutes.
# Only `user` mode CPU time is counted by this expression.
- alert: High CPU usage
  # Anchored as &cpu_gt_75pct so other rules can alias the same expression.
  expr: &cpu_gt_75pct avg by (instance) (rate(node_cpu_seconds_total{mode="user"}[2m])) > 0.75
  # The merged mapping is anchored as &brief (for: 5m, severity: interesting);
  # the "High network activity" rule merges it through *brief.
  <<: &brief
    for: 5m
    labels:
      severity: interesting
  annotations:
    summary: "Instance {{ $labels.instance }} is under high CPU usage"
# Raised when the user-mode CPU condition holds for twenty minutes.
- alert: Sustained high CPU usage
  # Alias of the expression anchored as &cpu_gt_75pct on the "High CPU usage"
  # rule, replacing a hand-copied duplicate so the threshold lives in one place.
  expr: *cpu_gt_75pct
  # The merged mapping is anchored as &sustained (for: 20m, severity: important);
  # the sustained network and disk I/O rules merge it through *sustained.
  <<: &sustained
    for: 20m
    labels:
      severity: important
  annotations:
    summary: "Instance {{ $labels.instance }} is under sustained high CPU usage"
# Raised when the user-mode CPU condition holds for sixty minutes.
- alert: Prolonged high CPU usage
  # Alias of the expression anchored as &cpu_gt_75pct on the "High CPU usage"
  # rule, replacing a hand-copied duplicate so the threshold lives in one place.
  expr: *cpu_gt_75pct
  # The merged mapping is anchored as &prolonged (for: 60m, severity: urgent);
  # the prolonged network and disk I/O rules merge it through *prolonged.
  <<: &prolonged
    for: 60m
    labels:
      severity: urgent
  annotations:
    # Previously said "sustained", which made this alert's summary identical
    # to the 20-minute rule's summary.
    summary: "Instance {{ $labels.instance }} is under prolonged high CPU usage"
# Raised when the 5-minute receive or transmit byte rate on a device matching
# eth0, ens3 or enp.* exceeds 10 MiB/s (bytes / 1024^2 > 10).
- alert: High network activity
  # Anchored as &net_gt_10mibsec; the sustained and prolonged network rules
  # alias this expression.
  expr: &net_gt_10mibsec >
    (rate(node_network_receive_bytes_total{device=~"eth0|ens3|enp.*"}[5m]) / 1024^2
    > 10) or (
    rate(node_network_transmit_bytes_total{device=~"eth0|ens3|enp.*"}[5m]) / 1024^2
    > 10)
  # Merges `for: 5m` and `severity: interesting` from the &brief anchor.
  <<: *brief
  annotations:
    summary: "Instance {{ $labels.instance }} >10 MiB/s network use"
# Same expression as "High network activity" (alias *net_gt_10mibsec), with
# `for: 20m` and `severity: important` merged from the &sustained anchor.
- alert: Sustained high network activity
  expr: *net_gt_10mibsec
  <<: *sustained
  annotations:
    summary: "Instance {{ $labels.instance }} sustained >10 MiB/s network use"
# Same expression as "High network activity" (alias *net_gt_10mibsec), with
# `for: 60m` and `severity: urgent` merged from the &prolonged anchor.
- alert: Prolonged high network activity
  expr: *net_gt_10mibsec
  <<: *prolonged
  annotations:
    summary: "Instance {{ $labels.instance }} prolonged >10 MiB/s network use"
# Disk I/O alerts: raised when the 5-minute read or write byte rate on a
# device matching sd.* or vd.* exceeds 20 MiB/s (bytes / 1024^2 > 20).
#
# The anchor name and both summaries previously said 5 MiB/s while the
# expression compared against 20. They now state the threshold the expression
# actually enforces, so alerting behaviour is unchanged.
# NOTE(review): if 5 MiB/s was the intended limit, change both `> 20`
# comparisons instead - confirm with the rule's owner.
- alert: Sustained high disk I/O
  expr: &disk_gt_20mibsec >
    (rate(node_disk_read_bytes_total{device=~"sd.*|vd.*"}[5m]) / 1024^2
    > 20) or (
    rate(node_disk_written_bytes_total{device=~"sd.*|vd.*"}[5m]) / 1024^2
    > 20)
  # Merges `for: 20m` and `severity: important` from the &sustained anchor.
  <<: *sustained
  annotations:
    summary: "Instance {{ $labels.instance }} sustained >20 MiB/s disk I/O"
- alert: Prolonged high disk I/O
  expr: *disk_gt_20mibsec
  # Merges `for: 60m` and `severity: urgent` from the &prolonged anchor.
  <<: *prolonged
  annotations:
    summary: "Instance {{ $labels.instance }} prolonged >20 MiB/s disk I/O"
# Raised when the 5-minute average of node_memory_MemAvailable_bytes drops
# below 100 MiB (100 * 1024^2 bytes). No `for` clause is set on this rule.
- alert: High memory usage
  expr: >
    avg_over_time(node_memory_MemAvailable_bytes[5m]) < 100 * 1024^2
  labels:
    severity: important
  annotations:
    summary: "Instance {{ $labels.instance }} <100MiB available memory"