119 lines
4.0 KiB
YAML
119 lines
4.0 KiB
YAML
# vim: ts=2 sw=2 :
|
|
# Prometheus alerting rules for host-level metrics (`up` plus the node_*
# metric families).
#
# Conventions used in this file:
#   * Severity labels in use: interesting, important, urgent. The CPU and
#     network tiers escalate in that order as the hold time grows.
#   * &brief / &sustained / &prolonged are mapping fragments that pair a
#     `for:` hold time with a severity label. They are anchored on the first
#     rule that needs them and pulled into later rules with `<<: *name`.
#     They have to live inline because every key in a rule is interpreted by
#     the consumer; there is no neutral place to park shared fragments.
#   * A PromQL expression shared by several tiers is anchored once and
#     aliased afterwards, so the threshold cannot drift between tiers.
#   * Multi-line expressions use `>-` (folded, no trailing newline); the
#     lines are joined with single spaces into one PromQL expression.
groups:
  - name: node
    rules:
      # ---- Availability ------------------------------------------------

      # Target has failed its scrape (`up == 0`) continuously for 2 minutes.
      - alert: Instance down
        expr: up == 0
        for: 2m
        labels:
          severity: urgent
        annotations:
          summary: "Instance {{ $labels.instance }} is down"

      # Reported boot time is less than 60 seconds old. No `for:` clause, so
      # it fires on the first evaluation that sees the fresh boot time.
      - alert: Instance rebooted
        expr: time() - node_boot_time_seconds < 60
        labels:
          severity: interesting
        annotations:
          summary: "Instance {{ $labels.instance }} was rebooted"

      # ---- Filesystems -------------------------------------------------

      # The filesystem mounted at / or /var reports itself read-only.
      - alert: Read-only filesystem
        expr: node_filesystem_readonly{mountpoint=~"/|/var"} != 0
        labels:
          severity: urgent
        annotations:
          summary: "Instance {{ $labels.instance }} read-only filesystem on {{ $labels.mountpoint }}"

      # Used fraction, (size - avail) / size, of / or /var is above 90%.
      - alert: High disk usage
        expr: >-
          (node_filesystem_size_bytes{mountpoint=~"/|/var"}
          - node_filesystem_avail_bytes{mountpoint=~"/|/var"})
          / node_filesystem_size_bytes{mountpoint=~"/|/var"}
          > 0.90
        labels:
          severity: important
        annotations:
          summary: "Instance {{ $labels.instance }} has high disk usage on {{ $labels.mountpoint }}"

      # Same ratio as "High disk usage", escalated at 98%.
      - alert: Full disk
        expr: >-
          (node_filesystem_size_bytes{mountpoint=~"/|/var"}
          - node_filesystem_avail_bytes{mountpoint=~"/|/var"})
          / node_filesystem_size_bytes{mountpoint=~"/|/var"}
          > 0.98
        labels:
          severity: urgent
        annotations:
          summary: "Instance {{ $labels.instance }} has a full disk on {{ $labels.mountpoint }}"

      # Used fraction of the /tmp mount is above 80% for 5 minutes.
      - alert: High tmpfs usage
        expr: >-
          (node_filesystem_size_bytes{mountpoint=~"/tmp"}
          - node_filesystem_avail_bytes{mountpoint=~"/tmp"})
          / node_filesystem_size_bytes{mountpoint=~"/tmp"}
          > 0.80
        for: 5m
        labels:
          severity: urgent
        annotations:
          summary: "Instance {{ $labels.instance }} has high tmpfs usage"

      # ---- CPU ---------------------------------------------------------
      # All three tiers share one expression: the per-instance average of
      # the 2-minute rate of user-mode CPU seconds, compared against 0.75.
      # Only mode="user" is counted.

      # Tier 1: defines &cpu_gt_75pct and the &brief fragment (5m).
      - alert: High CPU usage
        expr: &cpu_gt_75pct >-
          avg by (instance) (rate(node_cpu_seconds_total{mode="user"}[2m]))
          > 0.75
        <<: &brief
          for: 5m
          labels:
            severity: interesting
        annotations:
          summary: "Instance {{ $labels.instance }} is under high CPU usage"

      # Tier 2: defines the &sustained fragment (20m).
      - alert: Sustained high CPU usage
        expr: *cpu_gt_75pct
        <<: &sustained
          for: 20m
          labels:
            severity: important
        annotations:
          summary: "Instance {{ $labels.instance }} is under sustained high CPU usage"

      # Tier 3: defines the &prolonged fragment (60m).
      - alert: Prolonged high CPU usage
        expr: *cpu_gt_75pct
        <<: &prolonged
          for: 60m
          labels:
            severity: urgent
        annotations:
          summary: "Instance {{ $labels.instance }} is under prolonged high CPU usage"

      # ---- Network -----------------------------------------------------
      # Receive OR transmit 5-minute rate above 10 MiB/s (bytes / 1024^2) on
      # devices matching eth0, ens3 or enp*.

      - alert: High network activity
        expr: &net_gt_10mibsec >-
          (rate(node_network_receive_bytes_total{device=~"eth0|ens3|enp.*"}[5m]) / 1024^2
          > 10) or (
          rate(node_network_transmit_bytes_total{device=~"eth0|ens3|enp.*"}[5m]) / 1024^2
          > 10)
        <<: *brief
        annotations:
          summary: "Instance {{ $labels.instance }} >10 MiB/s network use"

      - alert: Sustained high network activity
        expr: *net_gt_10mibsec
        <<: *sustained
        annotations:
          summary: "Instance {{ $labels.instance }} sustained >10 MiB/s network use"

      - alert: Prolonged high network activity
        expr: *net_gt_10mibsec
        <<: *prolonged
        annotations:
          summary: "Instance {{ $labels.instance }} prolonged >10 MiB/s network use"

      # ---- Disk I/O ----------------------------------------------------
      # Read OR write 5-minute rate above 20 MiB/s on devices matching sd*
      # or vd*. There is no "brief" tier; the first tier is "sustained".
      #
      # NOTE(review): the expression compares against 20, while the anchor
      # name and both summaries previously said 5 MiB/s. The names and text
      # were aligned to the expression, since that is what actually fires.
      # Confirm 20 MiB/s is the intended threshold.

      - alert: Sustained high disk I/O
        expr: &disk_gt_20mibsec >-
          (rate(node_disk_read_bytes_total{device=~"sd.*|vd.*"}[5m]) / 1024^2
          > 20) or (
          rate(node_disk_written_bytes_total{device=~"sd.*|vd.*"}[5m]) / 1024^2
          > 20)
        <<: *sustained
        annotations:
          summary: "Instance {{ $labels.instance }} sustained >20 MiB/s disk I/O"

      - alert: Prolonged high disk I/O
        expr: *disk_gt_20mibsec
        <<: *prolonged
        annotations:
          summary: "Instance {{ $labels.instance }} prolonged >20 MiB/s disk I/O"

      # ---- Memory ------------------------------------------------------

      # 5-minute average of available memory is below 100 MiB
      # (100 * 1024^2 bytes).
      - alert: High memory usage
        expr: avg_over_time(node_memory_MemAvailable_bytes[5m]) < 100 * 1024^2
        labels:
          severity: important
        annotations:
          summary: "Instance {{ $labels.instance }} <100MiB available memory"