Update alarms info (#11481)

* update apcupsd alarm's info
Ilya Mashchenko 2021-09-27 15:31:51 +03:00 committed by GitHub
parent 44cf669e88
commit 07946d9f0a
58 changed files with 599 additions and 211 deletions

View File

@ -13,7 +13,11 @@ component: UPS
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 10m multiplier 1.5 max 1h
info: average UPS load over the last 10 minutes
info: Average UPS load over the last 10 minutes. \
High UPS load. \
It may result in either your UPS transferring to bypass or shutting down \
as a self-protection and safety measure due to overload. \
You should remove some attached equipment from the UPS.
to: sitemgr
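
The warn and crit expressions above use the hysteresis pattern that recurs throughout these files: once an alarm has been raised, the threshold for keeping it raised is lower than the threshold for raising it, so the status does not flap around a single value. Reading the expressions from this alarm:

    # raise WARNING above 80% load, but clear it only once load drops back below 70%
    warn: $this > (($status >= $WARNING) ? (70) : (80))
    # raise CRITICAL above 95% load, clear it only below 85%
    crit: $this > (($status == $CRITICAL) ? (85) : (95))
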
# Discussion in https://github.com/netdata/netdata/pull/3928:
@ -31,7 +35,9 @@ component: UPS
warn: $this < 100
crit: $this < (($status == $CRITICAL) ? (60) : (50))
delay: down 10m multiplier 1.5 max 1h
info: average UPS charge over the last minute
info: Average UPS charge over the last minute. \
The UPS is running on battery power. It will shut down if external power is not restored. \
You should prepare any attached equipment for the shutdown.
to: sitemgr
template: apcupsd_last_collected_secs

View File

@ -9,7 +9,8 @@ component: Exporting engine
every: 1m
warn: $this > 0
delay: down 5m multiplier 1.5 max 1h
info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
info: The backends subsystem is deprecated and will be removed soon. \
Migrate your configuration to exporting.conf.
to: sysadmin
# make sure we are sending data to backend
@ -25,7 +26,10 @@ component: Exporting engine
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful buffering of backend data
info: Number of seconds since the last successful buffering of backend data. \
The backends subsystem failed to buffer metrics for a while. Some metrics were lost while exporting. \
It indicates that the backend destination is down or unreachable. \
Short-term network availability issues might be fixed by increasing [buffer on failures] in netdata.conf.
to: dba
alarm: backend_metrics_sent
@ -38,5 +42,8 @@ component: Exporting engine
every: 10s
warn: $this != 100
delay: down 5m multiplier 1.5 max 1h
info: percentage of metrics sent to the backend server
info: Percentage of metrics sent to the backend server. \
The backends subsystem failed to send all metrics. Some metrics were lost while exporting. \
It indicates that the backend destination is down or unreachable. \
Short-term network availability issues might be fixed by increasing [buffer on failures] in netdata.conf.
to: dba
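
The two alarms above point at the [buffer on failures] option. A minimal netdata.conf sketch, assuming the classic [backend] section; the value shown is illustrative only:

    [backend]
        # keep this many data collection iterations buffered while the
        # destination is unreachable (value is illustrative)
        buffer on failures = 30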

View File

@ -9,9 +9,9 @@ component: Disk
every: 1m
warn: $this > 0
delay: up 2m down 1h multiplier 1.5 max 2h
info: number of times data was read from the cache, \
the bucket was reused and invalidated in the last 10 minutes \
(when this occurs the data is reread from the backing device)
info: Number of bcache read races in the last minute. \
The bucket was reused and invalidated while reading from the cache. \
When this occurs the data is reread from the backing device.
to: sysadmin
template: bcache_cache_dirty
@ -25,6 +25,7 @@ component: Disk
warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: up 1m down 1h multiplier 1.5 max 2h
info: percentage of cache space used for dirty data and metadata \
(this usually means your SSD cache is too small)
info: Percentage of cache space used for dirty data and metadata. \
High block cache utilization by dirty data and metadata. \
This usually means your SSD cache is too small.
to: sysadmin

View File

@ -11,9 +11,11 @@ component: Beanstalk
warn: $this > 0
crit: $this > 10
delay: up 0 down 5m multiplier 1.2 max 1h
info: number of buried jobs across all tubes. \
You need to manually kick them so they can be processed. \
Presence of buried jobs in a tube does not affect new jobs.
info: Number of buried jobs across all tubes. \
There are buried jobs. \
It usually happens when something goes wrong while a consumer is processing a job. \
The presence of buried jobs in a tube does not affect new jobs. \
You need to manually kick the jobs so that they can be processed.
to: sysadmin
# get the number of buried jobs per queue

View File

@ -13,7 +13,10 @@ component: File system
warn: $this > (($status >= $WARNING) ? (90) : (95))
crit: $this > (($status == $CRITICAL) ? (95) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
info: percentage of allocated BTRFS physical disk space
info: Percentage of allocated Btrfs physical disk space. \
Most of the Btrfs physical disk space is allocated. \
To fix it, first try running a Btrfs balance. \
If that does not help, consider deleting snapshots or adding more physical space to the pool.
to: sysadmin
template: btrfs_data
@ -30,7 +33,11 @@ component: File system
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
info: utilization of BTRFS data space
info: Percentage of used Btrfs data space. \
High Btrfs data space utilization. \
If there is enough unallocated memory, the data space will be automatically increased. \
Otherwise, first try running a balance. \
If that does not help, you should add more physical space to the pool.
to: sysadmin
template: btrfs_metadata
@ -47,7 +54,11 @@ component: File system
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
info: utilization of BTRFS metadata space
info: Percentage of used Btrfs metadata space. \
High Btrfs metadata space utilization. \
If there is enough unallocated memory, the metadata space will be automatically increased. \
Otherwise, you may wish to run a balance on metadata only if you find you have very large amounts of \
metadata space allocated, but unused.
to: sysadmin
template: btrfs_system
@ -64,5 +75,7 @@ component: File system
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
info: utilization of BTRFS system space
info: Percentage of used Btrfs system space. \
High Btrfs system space utilization. \
If there is enough unallocated memory, the system space will be automatically increased.
to: sysadmin
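
The Btrfs alarms above suggest running a balance. A hedged shell sketch; the mount point and usage filters are illustrative and should be adapted to your pool:

    # reclaim allocated-but-underused data chunks (mount point and filter are illustrative)
    btrfs balance start -dusage=50 /mnt/data
    # metadata-only balance, as suggested for the metadata alarm
    btrfs balance start -musage=50 /mnt/data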

View File

@ -11,5 +11,7 @@ component: Ceph
warn: $this > (($status >= $WARNING ) ? (85) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 5m multiplier 1.2 max 1h
info: cluster disk space utilization
info: Percentage of used cluster disk space. \
High disk space utilization. \
To fix this, consider adding a node or removing unneeded data from the cluster.
to: sysadmin

View File

@ -14,7 +14,10 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
info: Average CPU utilization over the last 10 minutes. \
High cgroup CPU utilization. \
The system will throttle the cgroup CPU usage when the usage is over the limit. \
To fix, increase the cgroup CPU limit.
to: sysadmin
template: cgroup_ram_in_use
@ -30,5 +33,8 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: cgroup memory utilization
info: Percentage of used memory. \
High cgroup memory utilization. \
OOM will kill some processes when the utilization reaches 100%. \
To fix, increase the cgroup memory limit (if set).
to: sysadmin
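
The cgroup alarms above suggest raising the CPU and memory limits. Two hedged examples, one for a systemd-managed service and one for a Docker container; the unit and container names and the limit values are illustrative:

    # systemd-managed service (unit name and values are illustrative)
    systemctl set-property myapp.service CPUQuota=200% MemoryMax=2G
    # Docker container (may also require --memory-swap if a swap limit was set)
    docker update --cpus 2 --memory 2g myapp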

View File

@ -12,7 +12,9 @@ component: CockroachDB
warn: $this > (($status >= $WARNING) ? (80) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: storage capacity utilization
info: Percentage of used storage space. \
High storage capacity utilization. \
To fix, increase the space available for CockroachDB data.
to: dba
template: cockroachdb_used_usable_storage_capacity
@ -26,7 +28,9 @@ component: CockroachDB
warn: $this > (($status >= $WARNING) ? (80) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: storage usable space utilization
info: Percentage of used usable storage capacity. \
High usable storage capacity utilization. \
To fix, increase the space available for CockroachDB data.
to: dba
# Replication
@ -41,7 +45,10 @@ component: CockroachDB
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: number of ranges with fewer live replicas than needed for quorum
info: Number of unavailable ranges. \
There are ranges with fewer live replicas than needed for quorum. \
If a majority of a range's replicas are on unavailable nodes, \
then the entire range is unavailable and will be unable to process queries.
to: dba
template: cockroachdb_underreplicated_ranges
@ -54,7 +61,9 @@ component: CockroachDB
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: number of ranges with fewer live replicas than the replication target
info: Number of under-replicated ranges. \
There are ranges with fewer live replicas than the replication target. \
As soon as other nodes become available, these ranges will be replicated to them until they reach the desired replication factor.
to: dba
# FD
@ -69,5 +78,7 @@ component: CockroachDB
every: 10s
warn: $this > 80
delay: down 15m multiplier 1.5 max 1h
info: open file descriptors utilization (against softlimit)
info: Percentage of used file descriptors. \
High file descriptor utilization (against the soft limit). \
To fix, adjust the file descriptors limit for the process or system-wide.
to: dba

View File

@ -14,7 +14,11 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal). \
High system CPU utilization. \
A constantly high value might indicate CPU bottleneck, which can make the system run slower. \
You can check the CPU PSI charts to see if there is CPU contention, and \
per-process CPU usage to find the top consumers.
to: sysadmin
template: 10min_cpu_iowait
@ -30,7 +34,9 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (20) : (40))
crit: $this > (($status == $CRITICAL) ? (40) : (50))
delay: down 15m multiplier 1.5 max 1h
info: average CPU iowait time over the last 10 minutes
info: Average CPU iowait time over the last 10 minutes. \
High system CPU iowait time. \
A constantly high value indicates that IO is a bottleneck, which can make the system run slower.
to: sysadmin
template: 20min_steal_cpu
@ -46,7 +52,10 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (20) : (30))
delay: down 1h multiplier 1.5 max 2h
info: average CPU steal time over the last 20 minutes
info: Average CPU steal time over the last 20 minutes. \
High system CPU steal time. \
A large amount of steal time indicates CPU contention on the host system, which can reduce guest performance. \
To fix, increase the guest CPU priority or CPU quota, or run fewer guests on the host.
to: sysadmin
## FreeBSD
@ -63,5 +72,8 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average CPU utilization over the last 10 minutes (excluding nice)
info: Average CPU utilization over the last 10 minutes (excluding nice). \
High system CPU utilization. \
A constantly high value might indicate CPU bottleneck, which can make the system run slower. \
You can check per-process CPU usage to find the top consumers.
to: sysadmin

View File

@ -13,7 +13,8 @@ component: DB engine
every: 10s
crit: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
info: Number of filesystem errors in the last 10 minutes. \
Dbengine is experiencing filesystem errors (too many open files, wrong permissions, etc.).
to: sysadmin
alarm: 10min_dbengine_global_io_errors
@ -28,7 +29,8 @@ component: DB engine
every: 10s
crit: $this > 0
delay: down 1h multiplier 1.5 max 3h
info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
info: Number of IO errors in the last 10 minutes. \
Dbengine is experiencing I/O errors (CRC errors, out of space, bad disk, etc.).
to: sysadmin
alarm: 10min_dbengine_global_flushing_warnings
@ -43,8 +45,9 @@ component: DB engine
every: 10s
warn: $this > 0
delay: down 1h multiplier 1.5 max 3h
info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
info: Number of times when dbengine dirty pages were over 50% of the instance page cache in the last 10 minutes. \
Metric data is at risk of not being stored in the database. \
To remedy, reduce disk load or use faster disks.
to: sysadmin
alarm: 10min_dbengine_global_flushing_errors
@ -59,6 +62,7 @@ component: DB engine
every: 10s
crit: $this != 0
delay: down 1h multiplier 1.5 max 3h
info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
info: Number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
Some metric data was dropped to unblock data collection. \
To fix, reduce disk load or use faster disks.
to: sysadmin

View File

@ -23,7 +23,10 @@ component: Disk
warn: $this > (($status >= $WARNING ) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
info: disk $family space utilization
info: Percentage of used space by disk $family. \
High disk space utilization. \
You may experience slowdowns and crashes if the disk is full. \
To fix, clean up your disk or upgrade it.
to: sysadmin
template: disk_inode_usage
@ -40,7 +43,10 @@ component: Disk
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
info: disk $family inode utilization
info: Percentage of used inodes by disk $family. \
High disk inode utilization. \
The number of inodes indicates the number of files and folders you have. \
To fix, clear cache files or delete unnecessary files and folders.
to: sysadmin
@ -147,7 +153,10 @@ component: Disk
every: 1m
warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: average percentage of time $family disk was busy over the last 10 minutes
info: Average percentage of time $family disk was busy over the last 10 minutes. \
High disk load. \
The disk spent most of the time servicing read or write requests. \
If the disk controller processes the operations in parallel, the alarm does not necessarily indicate a high load.
to: silent

View File

@ -11,5 +11,6 @@ component: DNS
every: 10s
warn: $this == nan
delay: up 20s down 5m multiplier 1.5 max 1h
info: average DNS query round trip time over the last 10 seconds
info: Average DNS query round trip time over the last 10 seconds. \
Failed to query the DNS server.
to: sysadmin

View File

@ -11,5 +11,8 @@ component: Dnsmasq
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: down 5m
info: DHCP range utilization
info: Percentage of leased IP addresses. \
High DHCP range utilization. \
The number of DHCP addresses in use is close to the total number of provisioned DHCP addresses. \
To fix, increase the number of IP addresses on a subnet.
to: sysadmin

View File

@ -7,5 +7,8 @@ component: Docker
every: 10s
lookup: average -10s
crit: $this > 0
info: average number of unhealthy docker containers over the last 10 seconds
info: Average number of unhealthy docker containers over the last 10 seconds. \
There are unhealthy docker containers. \
Some containers are not running due to failed health checks. \
To find unhealthy containers use [docker ps -a].
to: sysadmin
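
A hedged sketch of the [docker ps -a] check mentioned above, narrowed to failing health checks; the container name in the second command is illustrative:

    # list only containers whose health check is failing
    docker ps -a --filter health=unhealthy
    # inspect the health-check log of a specific container (name is illustrative)
    docker inspect --format '{{json .State.Health}}' myapp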

View File

@ -15,5 +15,8 @@ component: Cryptography
every: 5m
warn: $this < (($status >= $WARNING) ? (200) : (100))
delay: down 1h multiplier 1.5 max 2h
info: minimum number of entries in the random numbers pool in the last 5 minutes
info: Minimum number of bits in the random numbers pool in the last 5 minutes. \
A low number of bits of entropy available. \
It may have a negative impact on the security and performance of the system. \
This can be fixed by installing the [haveged] or [rngd] daemon.
to: silent
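
The entropy alarm above suggests installing [haveged] or [rngd]. A hedged sketch; package and service names may differ between distributions:

    # Debian/Ubuntu
    apt install haveged && systemctl enable --now haveged
    # RHEL/CentOS alternative: the rngd daemon from rng-tools
    yum install rng-tools && systemctl enable --now rngd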

View File

@ -11,7 +11,11 @@ component: Exporting engine
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful buffering of exporting data
info: Number of seconds since the last successful buffering of exporting data. \
Exporting engine failed to buffer metrics for a while. \
Some metrics were lost while exporting. \
It indicates that the exporting destination is down or unreachable. \
Short-term network availability problems might be fixed by increasing [buffer on failures] in exporting.conf.
to: dba
template: exporting_metrics_sent
@ -25,5 +29,9 @@ component: Exporting engine
every: 10s
warn: $this != 100
delay: down 5m multiplier 1.5 max 1h
info: percentage of metrics sent to the external database server
info: Percentage of metrics sent to the external database server. \
Exporting engine failed to send all metrics. \
Some metrics were lost while exporting. \
It indicates that the exporting destination is down or unreachable. \
Short-term network availability problems might be fixed by increasing [buffer on failures] in exporting.conf.
to: dba

View File

@ -25,7 +25,9 @@ component: Network
every: 10s
crit: $this == 0
delay: down 30m multiplier 1.5 max 2h
info: reachability status of the network host (0: unreachable, 1: reachable)
info: Reachability status of the network host (0: unreachable, 1: reachable). \
The network host is unreachable. \
Most likely the host is down, or you are experiencing networking issues.
to: sysadmin
template: fping_host_latency
@ -42,7 +44,9 @@ component: Network
warn: $this > $green OR $max > $red
crit: $this > $red
delay: down 30m multiplier 1.5 max 2h
info: average latency to the network host over the last 10 seconds
info: Average latency to the network host over the last 10 seconds. \
High latency to the network host. \
Most likely you are experiencing networking issues or the host is overloaded.
to: sysadmin
template: fping_packet_loss
@ -60,5 +64,7 @@ component: Network
warn: $this > $green
crit: $this > $red
delay: down 30m multiplier 1.5 max 2h
info: packet loss ratio to the network host over the last 10 minutes
info: Packet loss ratio to the network host over the last 10 minutes. \
High packet loss to the network host. \
Most likely you are experiencing networking issues or the host is overloaded.
to: sysadmin

View File

@ -10,5 +10,7 @@ component: Gearman
warn: $this > 30000
crit: $this > 100000
delay: down 5m multiplier 1.5 max 1h
info: average number of queued jobs over the last 10 minutes
info: Average number of queued jobs over the last 10 minutes. \
A high number of queued jobs. \
To fix, add more workers.
to: sysadmin

View File

@ -7,7 +7,8 @@ component: HAProxy
every: 10s
lookup: average -10s
crit: $this > 0
info: average number of failed haproxy backend servers over the last 10 seconds
info: Average number of failed haproxy backend servers over the last 10 seconds. \
Some haproxy backend servers are inaccessible or offline.
to: sysadmin
template: haproxy_backend_status
@ -19,5 +20,6 @@ component: HAProxy
every: 10s
lookup: average -10s
crit: $this > 0
info: average number of failed haproxy backends over the last 10 seconds
info: Average number of failed haproxy backends over the last 10 seconds. \
Some haproxy backends are offline.
to: sysadmin

View File

@ -12,7 +12,9 @@ component: HDFS
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (80) : (98))
delay: down 15m multiplier 1.5 max 1h
info: summary datanodes space capacity utilization
info: Percentage of used space capacity across all datanodes. \
High datanodes space capacity utilization. \
To fix this, you can add additional disks/datanodes and rebalance the datanode/cluster.
to: sysadmin
@ -28,7 +30,11 @@ component: HDFS
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: number of missing blocks
info: Number of missing blocks. \
There are missing blocks on some datanodes. \
It may indicate a problem with the underlying storage or filesystem of a datanode. \
To fix, you can find which blocks are missing using [hdfs fsck] and bring the blocks back online. \
If the file cannot be restored, you should delete it.
to: sysadmin
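
A hedged sketch of the [hdfs fsck] check mentioned above; deleting corrupt files is destructive, so only run the second command after confirming the blocks cannot be recovered:

    # list files with missing or corrupt blocks
    hdfs fsck / -list-corruptfileblocks
    # remove files whose blocks cannot be restored (destructive)
    hdfs fsck / -delete
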
@ -42,7 +48,11 @@ component: HDFS
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: number of datanodes marked stale due to delayed heartbeat
info: Number of stale datanodes. \
There are datanodes marked stale because their heartbeats were delayed. \
The stale datanodes are given the lowest priority for reads and writes. \
If the datanode is not back online for a while, it will be considered dead.
to: sysadmin
@ -56,7 +66,9 @@ component: HDFS
every: 10s
crit: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: number of datanodes which are currently dead
info: Number of datanodes that are currently dead. \
There are dead datanodes. \
Any data that was registered to a dead datanode is not available to HDFS anymore.
to: sysadmin
@ -72,5 +84,10 @@ component: HDFS
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: number of failed volumes
info: Number of failed volumes. \
There are failed volumes on some datanodes. \
It may indicate a hardware failure or misconfiguration, e.g. duplicate mounts. \
By default, a single volume failing on a datanode will cause the entire node to go offline. \
The namenode must copy any under-replicated blocks that were lost on that node, \
causing a burst in network traffic and potential performance degradation.
to: sysadmin

View File

@ -12,5 +12,7 @@ component: Disk
warn: $this > $green OR $max > $red
crit: $this > $red
delay: down 30m multiplier 1.5 max 2h
info: average I/O latency over the last 10 seconds
info: Average I/O latency over the last 10 seconds. \
High disk I/O latency. \
It may indicate that the disk is under high load or is slow.
to: sysadmin

View File

@ -14,7 +14,10 @@ component: IPC
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
info: IPC semaphore utilization
info: Percentage of used IPC semaphores. \
High IPC semaphore utilization. \
A lack of available semaphores can affect application performance. \
To fix, adjust semaphore limits on your system.
to: sysadmin
alarm: semaphore_arrays_used
@ -30,5 +33,8 @@ component: IPC
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
info: IPC semaphore arrays utilization
info: Percentage of used IPC semaphore arrays. \
High IPC semaphore arrays utilization. \
A lack of available semaphore arrays can affect application performance. \
To fix, adjust semaphore limits on your system.
to: sysadmin

View File

@ -10,5 +10,6 @@ component: IPFS
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: IPFS datastore utilization
info: Percentage of used IPFS datastore space. \
High IPFS storage repository space utilization.
to: sysadmin

View File

@ -9,7 +9,9 @@ component: IPMI
warn: $this > 0
crit: $critical > 0
delay: up 5m down 15m multiplier 1.5 max 1h
info: number of IPMI sensors in non-nominal state
info: Number of IPMI sensors in the non-nominal state. \
There are IPMI sensors in the warning or critical state. \
You can find the system sensor type, state, and threshold using [ipmi-sensors].
to: sysadmin
alarm: ipmi_events
@ -22,5 +24,8 @@ component: IPMI
every: 10s
warn: $this > 0
delay: up 5m down 15m multiplier 1.5 max 1h
info: number of events in the IPMI System Event Log (SEL)
info: Number of events in the IPMI System Event Log (SEL). \
IPMI System Event Log (SEL) is not empty. \
It contains critical, warning, and informational events. \
To view them use [ipmitool sel list].
to: sysadmin
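
A hedged sketch of the inspection commands referenced by the IPMI alarms above ([ipmi-sensors] and [ipmitool sel list]):

    # show sensor readings and their state
    ipmi-sensors
    # list the System Event Log entries referenced by the second alarm
    ipmitool sel list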

View File

@ -29,7 +29,8 @@ component: Kubelet
every: 10s
warn: $this > 0
delay: down 1m multiplier 1.5 max 2h
info: number of failed Token() requests to the alternate token source
info: Number of failed authentication token requests. \
There are failed Token() requests to Google Compute Engine (the alternate token source).
to: sysadmin
# Docker and runtime operation errors
@ -70,7 +71,7 @@ component: Kubelet
lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
info: Average pod relisting duration over the last minute (quantile 0.5).
template: kubelet_10s_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
@ -84,8 +85,9 @@ component: Kubelet
warn: $this > (($status >= $WARNING)?(100):(200))
crit: $this > (($status >= $WARNING)?(200):(400))
delay: down 1m multiplier 1.5 max 2h
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.5)
info: Ratio of average pod relisting duration over the last 10 seconds, compared to the last minute (quantile 0.5). \
Relisting time has increased significantly. \
It may indicate some problems with the container runtime engine.
to: sysadmin
# quantile 0.9
@ -98,7 +100,7 @@ component: Kubelet
lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
info: Average pod relisting duration over the last minute (quantile 0.9).
template: kubelet_10s_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
@ -112,8 +114,9 @@ component: Kubelet
warn: $this > (($status >= $WARNING)?(200):(400))
crit: $this > (($status >= $WARNING)?(400):(800))
delay: down 1m multiplier 1.5 max 2h
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.9)
info: Ratio of average pod relisting duration over the last 10 seconds, compared to the last minute (quantile 0.9). \
Relisting time has increased significantly. \
It may indicate some problems with the container runtime engine.
to: sysadmin
# quantile 0.99
@ -126,7 +129,7 @@ component: Kubelet
lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
info: Average pod relisting duration over the last minute (quantile 0.99).
template: kubelet_10s_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
@ -140,6 +143,7 @@ component: Kubelet
warn: $this > (($status >= $WARNING)?(400):(800))
crit: $this > (($status >= $WARNING)?(800):(1200))
delay: down 1m multiplier 1.5 max 2h
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.99)
info: Ratio of average pod relisting duration over the last 10 seconds, compared to the last minute (quantile 0.99). \
Relisting time has increased significantly. \
It may indicate some problems with the container runtime engine.
to: sysadmin

View File

@ -11,5 +11,8 @@ component: Battery
warn: $this < 10
crit: $this < 5
delay: up 30s down 5m multiplier 1.2 max 1h
info: percentage of remaining power supply capacity
info: Percentage of remaining power supply capacity. \
The remaining power supply capacity is low. \
The system may run out of power and switch off soon. \
Prepare the system for shutdown.
to: sysadmin

View File

@ -32,7 +32,11 @@ component: Load
every: 1m
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
delay: down 15m multiplier 1.5 max 1h
info: system fifteen-minute load average
info: System fifteen-minute load average. \
High system fifteen-minute load average. \
A constantly high value indicates that your system is overloaded. \
It includes both CPU and I/O demand. \
You might want to check per-process CPU/disk usage to find the top consumers.
to: sysadmin
alarm: load_average_5
@ -47,7 +51,11 @@ component: Load
every: 1m
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
delay: down 15m multiplier 1.5 max 1h
info: system five-minute load average
info: System five-minute load average. \
High system five-minute load average. \
A constantly high value indicates that your system is overloaded. \
It includes both CPU and I/O demand. \
You might want to check per-process CPU/disk usage to find the top consumers.
to: sysadmin
alarm: load_average_1
@ -62,5 +70,9 @@ component: Load
every: 1m
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
delay: down 15m multiplier 1.5 max 1h
info: system one-minute load average
info: System one-minute load average. \
High system one-minute load average. \
A constantly high value indicates that your system is overloaded. \
It includes both CPU and I/O demand. \
You might want to check per-process CPU/disk usage to find the top consumers.
to: sysadmin

View File

@ -20,8 +20,10 @@ component: RAID
every: 10s
calc: $down
crit: $this > 0
info: number of devices in the down state for the $family array. \
Any number > 0 indicates that the array is degraded.
info: Number of devices in the down state for the $family array. \
The array is degraded. Some array devices are missing. \
To fix, bring them back online or replace the faulty ones. \
To find faulty devices use [mdadm --detail RAIDDEVICE].
to: sysadmin
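
A hedged sketch of the [mdadm] inspection mentioned above; the array and member device names are illustrative:

    # show the state of each member device (array name is illustrative)
    mdadm --detail /dev/md0
    # re-add a device that dropped out of the array (device name is illustrative)
    mdadm --manage /dev/md0 --re-add /dev/sdb1
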
template: mdstat_mismatch_cnt
@ -35,7 +37,9 @@ component: RAID
every: 60s
warn: $this > 1024
delay: up 30m
info: number of unsynchronized blocks for the $family array
info: Number of unsynchronized blocks for the $family array. \
A high number of unsynchronized blocks. \
This might indicate that data on the array is corrupted.
to: sysadmin
template: mdstat_nonredundant_last_collected

View File

@ -11,7 +11,7 @@ component: RAID
every: 10s
crit: $this > 0
delay: down 5m multiplier 2 max 10m
info: adapter is in the degraded state (0: false, 1: true)
info: Adapter is in the degraded state (0: false, 1: true).
to: sysadmin
## Physical Disks
@ -26,7 +26,12 @@ component: RAID
every: 10s
warn: $this > 0
delay: up 1m down 5m multiplier 2 max 10m
info: number of physical drive predictive failures
info: Number of physical drive predictive failures. \
There are physical drive predictive failures. \
The failure prediction function for the hard disk drives determines the risk of a failure in advance \
and issues a warning when the risk is high. \
A hard disk can still operate normally but may fail in the near future. \
You might want to consider replacing the disk.
to: sysadmin
template: megacli_pd_media_errors
@ -39,7 +44,10 @@ component: RAID
every: 10s
warn: $this > 0
delay: up 1m down 5m multiplier 2 max 10m
info: number of physical drive media errors
info: Number of physical drive media errors. \
There are physical drive media errors. \
It may indicate that a bad sector was found on the media, there is a mechanical failure of the device, \
the host device detected an invalid sequence, or the target device is missing.
to: sysadmin
## Battery Backup Units (BBU)
@ -54,7 +62,11 @@ component: RAID
every: 10s
warn: $this <= (($status >= $WARNING) ? (85) : (80))
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
info: Average battery backup unit relative state of charge over the last 10 seconds. \
The state of charge is low. \
The relative state of charge is an indication of full charge capacity percentage in relation to the design capacity. \
A constantly low value may indicate that the battery is worn out. \
You might want to consider changing the battery.
to: sysadmin
template: megacli_bbu_cycle_count
@ -67,5 +79,8 @@ component: RAID
every: 10s
warn: $this >= 100
crit: $this >= 500
info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
info: Average battery backup unit charge cycles count over the last 10 seconds. \
A high number of full recharge cycles. \
It affects the battery capacity. \
You might want to consider changing the battery.
to: sysadmin

View File

@ -12,7 +12,10 @@ component: Memcached
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (80) : (90))
delay: up 0 down 15m multiplier 1.5 max 1h
info: cache memory utilization
info: Percentage of used memory. \
High cache memory utilization. \
If you are getting close to 100% you will probably start experiencing evictions. \
Consider increasing the cache size.
to: dba
@ -27,7 +30,7 @@ component: Memcached
calc: ($this - $available) / (($now - $after) / 3600)
units: KB/hour
every: 1m
info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
info: Average rate the cache fills up (positive), or frees up (negative) space over the last hour.
# find the hours remaining until memcached cache is full
@ -43,6 +46,8 @@ component: Memcached
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
delay: down 15m multiplier 1.5 max 1h
info: estimated time the cache will run out of space \
if the system continues to add data at the same rate as the past hour
info: Estimated time until the cache runs out of space \
if the system continues to add data at the same rate as the past hour. \
The cache will run out of space soon. \
If you are getting close to 100% you will probably start experiencing evictions.
to: dba

View File

@ -13,7 +13,10 @@ component: Memory
every: 1m
warn: $this > 0
delay: down 1h multiplier 1.5 max 1h
info: number of ECC correctable errors in the last 10 minutes
info: Number of ECC correctable errors in the last 10 minutes. \
There are ECC correctable errors. \
This event does not immediately lead to problems, but it may indicate one of the DIMM modules is slowly failing. \
Check contacts and, if this error occurs more than once, consider replacing the DIMM as a preventative measure.
to: sysadmin
alarm: 1hour_ecc_memory_uncorrectable
@ -28,7 +31,11 @@ component: Memory
every: 1m
crit: $this > 0
delay: down 1h multiplier 1.5 max 1h
info: number of ECC uncorrectable errors in the last 10 minutes
info: Number of ECC uncorrectable errors in the last 10 minutes. \
There are ECC uncorrectable errors. This is a fatal issue. \
While the error may be due to a failing DRAM chip, \
it can also be caused by incorrect seating or improper contact between the socket and DIMM. \
Check contacts and consider replacing the DIMM as a preventative measure.
to: sysadmin
alarm: 1hour_memory_hw_corrupted
@ -43,5 +50,9 @@ component: Memory
every: 10s
warn: $this > 0
delay: down 1h multiplier 1.5 max 1h
info: amount of memory corrupted due to a hardware failure
info: Amount of memory corrupted due to a hardware failure. \
Memory is corrupted due to a hardware failure. \
While the error may be due to a failing DRAM chip, \
it can also be caused by incorrect seating or improper contact between the socket and DIMM. \
Check contacts and consider replacing the DIMM as a preventative measure.
to: sysadmin

View File

@ -12,7 +12,11 @@ component: MySQL
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (10) : (20))
delay: down 5m multiplier 1.5 max 1h
info: number of slow queries in the last 10 seconds
info: Number of slow queries in the last 10 seconds. \
A high number of slow queries. \
A query is slow if it has taken more than [long_query_time] seconds. \
The value is measured in real time, not CPU time. \
If you are concerned with query execution time, check system load and consider optimizing the queries.
to: dba
@ -52,7 +56,10 @@ component: MySQL
warn: $this > (($status >= $WARNING) ? (10) : (25))
crit: $this > (($status == $CRITICAL) ? (25) : (50))
delay: down 30m multiplier 1.5 max 1h
info: ratio of waited table locks over the last 10 seconds
info: Ratio of waited table locks over the last 10 seconds. \
High ratio of waited table locks. \
If this is constantly high and you have performance problems, \
you should first optimize your queries and then either split your table or tables or use replication.
to: dba
@ -70,7 +77,9 @@ component: MySQL
warn: $this > (($status >= $WARNING) ? (60) : (70))
crit: $this > (($status == $CRITICAL) ? (80) : (90))
delay: down 15m multiplier 1.5 max 1h
info: client connections utilization
info: Percentage of used client connections. \
High client connections utilization. \
To fix, increase the client connection limit using the [max_connections] MySQL system variable.
to: dba
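
A hedged sketch of checking and raising the [max_connections] limit mentioned above; the new value is illustrative and should also be persisted in the server configuration:

    # check current usage against the limit (run from a MySQL shell or via mysql -e)
    SHOW GLOBAL STATUS LIKE 'Threads_connected';
    SHOW GLOBAL VARIABLES LIKE 'max_connections';
    # raise the limit at runtime (value is illustrative; persist it in my.cnf as well)
    SET GLOBAL max_connections = 500;
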
@ -87,7 +96,10 @@ component: MySQL
every: 10s
crit: $this == 0
delay: down 5m multiplier 1.5 max 1h
info: replication status (0: stopped, 1: working)
info: Replication status (0: stopped, 1: working). \
Replication is not working. \
One or both of the replication I/O and SQL threads are not running. \
Checking error.log may help identify the problem.
to: dba
template: mysql_replication_lag
@ -101,8 +113,10 @@ component: MySQL
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (10) : (30))
delay: down 15m multiplier 1.5 max 1h
info: difference between the timestamp of the latest transaction processed by the SQL thread and \
the timestamp of the same transaction when it was processed on the master
info: Number of seconds the replica is behind the master. \
The replication SQL thread is far behind processing the source binary log. \
A constantly high value (or an increasing one) indicates that the replica is unable \
to handle events from the source in a timely fashion.
to: dba
@ -131,7 +145,9 @@ component: MySQL
warn: $this > $mysql_galera_cluster_size_max_2m
crit: $this < $mysql_galera_cluster_size_max_2m
delay: up 20s down 5m multiplier 1.5 max 1h
info: current galera cluster size, compared to the maximum size in the last 2 minutes
info: Current Galera cluster size, compared to the maximum size in the last 2 minutes. \
Galera cluster size has changed. \
It may indicate a network connectivity problem, or maybe MySQL is down on one node.
to: dba
# galera node state
@ -146,8 +162,9 @@ component: MySQL
warn: $this == 2 OR $this == 3
crit: $this == 0 OR $this == 1 OR $this >= 5
delay: up 30s down 5m multiplier 1.5 max 1h
info: galera node state \
(0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
info: Galera node state (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent). \
Galera node is not synced. \
It may indicate that the node lost its connection to the Primary Component due to network partition.
to: dba
@ -162,7 +179,8 @@ component: MySQL
every: 10s
crit: $mysql_galera_cluster_state != nan AND $this != 0
delay: up 30s down 5m multiplier 1.5 max 1h
info: galera node cluster component status \
info: Galera node cluster component status \
(-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
Any other value than primary indicates that the node is part of a nonoperational component.
The node is part of a nonoperational component. \
It indicates that the node has lost quorum or is unable to connect to the other cluster nodes.
to: dba

View File

@ -31,7 +31,9 @@ component: Network
every: 10s
warn: $this > (($status >= $WARNING) ? (85) : (90))
delay: up 1m down 1m multiplier 1.5 max 1h
info: average inbound utilization for the network interface $family over the last minute
info: Average inbound utilization for the network interface $family over the last minute. \
High inbound utilization. \
Network delays and packet drops are expected when you are getting closer to 100%.
to: sysadmin
template: 1m_sent_traffic_overflow
@ -48,7 +50,9 @@ component: Network
every: 10s
warn: $this > (($status >= $WARNING) ? (85) : (90))
delay: up 1m down 1m multiplier 1.5 max 1h
info: average outbound utilization for the network interface $family over the last minute
info: Average outbound utilization for the network interface $family over the last minute. \
High outbound utilization. \
Network delays and packet drops are expected when you are getting closer to 100%.
to: sysadmin
# -----------------------------------------------------------------------------
@ -101,7 +105,9 @@ component: Network
every: 1m
warn: $this >= 2
delay: up 1m down 1h multiplier 1.5 max 2h
info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
info: Ratio of inbound dropped packets for the network interface $family over the last 10 minutes. \
Network interface $family dropped many packets. \
It means packets were received but not processed, e.g. due to lack of resources or unsupported protocol.
to: sysadmin
template: outbound_packets_dropped_ratio
@ -118,7 +124,9 @@ component: Network
every: 1m
warn: $this >= 2
delay: up 1m down 1h multiplier 1.5 max 2h
info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
info: Ratio of outbound dropped packets for the network interface $family over the last 10 minutes. \
Network interface $family dropped many packets. \
It means packets were dropped on their way to transmission, e.g. due to lack of resources.
to: sysadmin
template: wifi_inbound_packets_dropped_ratio
@ -135,7 +143,9 @@ component: Network
every: 1m
warn: $this >= 10
delay: up 1m down 1h multiplier 1.5 max 2h
info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
info: Ratio of inbound dropped packets for the network interface $family over the last 10 minutes. \
Network interface $family dropped many packets. \
It means packets were received but not processed, e.g. due to lack of resources or unsupported protocol.
to: sysadmin
template: wifi_outbound_packets_dropped_ratio
@ -152,7 +162,9 @@ component: Network
every: 1m
warn: $this >= 10
delay: up 1m down 1h multiplier 1.5 max 2h
info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
info: Ratio of outbound dropped packets for the network interface $family over the last 10 minutes. \
Network interface $family dropped many packets. \
It means packets were dropped on their way to transmission, e.g. due to lack of resources.
to: sysadmin
# -----------------------------------------------------------------------------
@ -171,7 +183,9 @@ component: Network
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of inbound errors for the network interface $family in the last 10 minutes
info: Number of inbound errors for the network interface $family in the last 10 minutes. \
Network interface $family received many bad packets. \
It includes length, CRC, frame errors, etc.
to: sysadmin
template: interface_outbound_errors
@ -187,7 +201,9 @@ component: Network
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of outbound errors for the network interface $family in the last 10 minutes
info: Number of outbound errors for the network interface $family in the last 10 minutes. \
Network interface $family experienced many transmit problems. \
It includes aborted, carrier, FIFO, heartbeat, window errors, etc.
to: sysadmin
# -----------------------------------------------------------------------------
@ -211,7 +227,11 @@ component: Network
every: 1m
warn: $this > 0
delay: down 1h multiplier 1.5 max 2h
info: number of FIFO errors for the network interface $family in the last 10 minutes
info: Number of FIFO errors for the network interface $family in the last 10 minutes. \
Network interface $family experienced FIFO errors. \
It indicates that the NIC is not able to handle the peak load of \
incoming/outgoing packets with the current ring buffer size. \
In case of a high packet drop rate, increasing the appropriate ring buffer can fix the issue.
to: sysadmin
# -----------------------------------------------------------------------------
@ -251,6 +271,8 @@ component: Network
warn: $this > (($status >= $WARNING)?(200):(5000))
crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
compared to the rate over the last minute
info: Ratio of the average number of received packets for the network interface $family over the last 10 seconds, \
compared to the rate over the last minute. \
The number of received packets has increased significantly. \
It may indicate a broadcast/multicast storm or DoS attack.
to: sysadmin

View File

@ -15,5 +15,8 @@ component: Network
warn: $this > (($status >= $WARNING) ? (85) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (95))
delay: down 5m multiplier 1.5 max 1h
info: netfilter connection tracker table size utilization
info: Percentage of used netfilter tracked connections. \
High netfilter connection tracking state table size utilization. \
Network delays and packet drops are expected when you are getting closer to 100%. \
To fix, increase the table size.
to: sysadmin
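
A hedged sketch of inspecting and raising the conntrack table size mentioned above; the new value is illustrative and should be persisted via /etc/sysctl.conf or sysctl.d:

    # check current usage against the limit
    sysctl net.netfilter.nf_conntrack_count net.netfilter.nf_conntrack_max
    # raise the limit (value is illustrative)
    sysctl -w net.netfilter.nf_conntrack_max=262144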

View File

@ -12,7 +12,8 @@ component: Pi-hole
warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
delay: up 2m down 5m
info: percentage of blocked dns queries over the last 24 hour
info: Percentage of blocked DNS queries over the last 24 hours. \
A high percentage of blocked DNS queries.
to: sysadmin
@ -29,7 +30,9 @@ component: Pi-hole
calc: $ago
warn: $this > 60 * 60 * 24 * 8
crit: $this > 60 * 60 * 24 * 8 * 2
info: gravity.list (blocklist) file last update time
info: Blocklist file last update time. \
The blocklist file has not been updated for a long time. \
To rebuild the blocklist, run the [pihole -g] command.
to: sysadmin
# Gravity file check (gravity.list).
@ -44,7 +47,9 @@ component: Pi-hole
calc: $file_exists
crit: $this != 1
delay: up 2m down 5m
info: gravity.list (blocklist) file existence state (0: exists, 1: not-exists)
info: Blocklist file state (0: exists, 1: not-exists). \
The blocklist file does not exist. \
To rebuild the blocklist, run the [pihole -g] command.
to: sysadmin
# Pi-hole's ability to block unwanted domains.
@ -60,5 +65,7 @@ component: Pi-hole
calc: $enabled
warn: $this != 1
delay: up 2m down 5m
info: unwanted domains blocking status (0: enabled, 1: disabled)
info: Unwanted domains blocking status (0: enabled, 1: disabled). \
The ability of Pi-hole to block unwanted domains is disabled. \
To fix, run the [pihole enable] command.
to: sysadmin
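
A hedged sketch of the [pihole] commands referenced by the alarms above:

    pihole -g        # rebuild the gravity blocklist
    pihole enable    # re-enable blocking if it was disabled
    pihole status    # verify that blocking is active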

View File

@ -25,7 +25,9 @@ component: TCP endpoint
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
info: average ratio of timeouts over the last 5 minutes
info: Average ratio of timeouts over the last 5 minutes. \
Too many timeouts. The monitored endpoint is unreachable. \
Most likely you are experiencing networking issues or the host/service is overloaded.
to: sysadmin
template: portcheck_connection_fails
@ -40,5 +42,7 @@ component: TCP endpoint
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
info: average ratio of failed connections over the last 5 minutes
info: Average ratio of failed connections over the last 5 minutes. \
Too many failed connections. The monitored endpoint is unreachable. \
Most likely the service is no longer running or access is denied by a firewall.
to: sysadmin

View File

@ -12,5 +12,7 @@ component: Processes
warn: $this > (($status >= $WARNING) ? (85) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (95))
delay: down 5m multiplier 1.5 max 1h
info: system process IDs (PID) space utilization
info: Percentage of used PIDs. \
High system process IDs (PID) space utilization. \
If this value reaches 100%, the system cannot start new processes.
to: sysadmin

View File

@ -26,7 +26,11 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: system memory utilization
info: Percentage of used RAM. \
High RAM utilization. \
It may affect the performance of applications. \
If there is no swap space available, OOM Killer can start killing processes. \
You might want to check per-process memory usage to find the top consumers.
to: sysadmin
alarm: ram_available
@ -42,7 +46,11 @@ component: Memory
warn: $this < (($status >= $WARNING) ? (15) : (10))
crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
info: Percentage of the estimated amount of RAM available for userspace processes without causing swapping. \
Low amount of available memory. \
It may affect the performance of applications. \
If there is no swap space available, OOM Killer can start killing processes. \
You might want to check per-process memory usage to find the top consumers.
to: sysadmin
alarm: oom_kill
@ -55,7 +63,9 @@ component: Memory
warn: $this > 0
delay: down 10m
host labels: _is_k8s_node = false
info: number of out of memory kills in the last 30 minutes
info: Number of out of memory kills in the last 30 minutes. \
Some processes got killed by OOM Killer. \
To fix, decrease memory usage, adjust memory limits for cgroups, or add more memory/swap to the system.
to: sysadmin
## FreeBSD
@ -72,7 +82,11 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: system memory utilization
info: Percentage of used RAM. \
High RAM utilization. \
It may affect the performance of applications. \
If there is no swap space available, OOM Killer can start killing processes. \
You might want to check per-process memory usage to find the top consumers.
to: sysadmin
alarm: ram_available
@ -88,5 +102,9 @@ component: Memory
warn: $this < (($status >= $WARNING) ? (15) : (10))
crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
info: Percentage of the estimated amount of RAM available for userspace processes without causing swapping. \
Low amount of available memory. \
It may affect the performance of applications. \
If there is no swap space available, OOM Killer can start killing processes. \
You might want to check per-process memory usage to find the top consumers.
to: sysadmin

View File

@ -8,7 +8,9 @@ component: Redis
every: 10s
crit: $rdb_last_bgsave_status != 0
units: ok/failed
info: status of the last RDB save operation (0: ok, 1: error)
info: Status of the last RDB save operation (0: ok, 1: error). \
Redis failed to save the RDB snapshot on disk. \
Check Redis logs for details about the error.
delay: down 5m multiplier 1.5 max 1h
to: dba
@ -22,6 +24,9 @@ component: Redis
warn: $rdb_bgsave_in_progress > 600
crit: $rdb_bgsave_in_progress > 1200
units: seconds
info: duration of the on-going RDB save operation
info: Duration of the ongoing RDB save operation. \
Saving RDB snapshot on disk is taking too long. \
Possible reasons are a large dataset size or a lack of CPU resources. \
It may result in Redis stopping serving clients for a few milliseconds or even for a second.
delay: down 5m multiplier 1.5 max 1h
to: dba

View File

@ -12,5 +12,6 @@ component: Retroshare
warn: $this < (($status >= $WARNING) ? (120) : (100))
crit: $this < (($status == $CRITICAL) ? (10) : (1))
delay: up 0 down 15m multiplier 1.5 max 1h
info: number of DHT peers
info: Number of DHT peers. \
A low number of DHT peers.
to: sysadmin

View File

@ -9,7 +9,10 @@ component: Riak KV
units: state machines
every: 10s
warn: $list_fsm_active > 0
info: number of currently running list keys finite state machines
info: Number of currently running list keys finite state machines. \
There are active list keys FSMs. \
A key listing in Riak is a very expensive operation, and should not be used in production \
as it will affect the performance of the cluster and not scale well.
to: dba
@ -38,9 +41,9 @@ component: Riak KV
every: 10s
warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
info: average time between reception of client GET request and \
subsequent response to the client over the last 3 minutes, \
compared to the average over the last hour
info: Average GET request processing time over the last 3 minutes, compared to the average over the last hour. \
GET request processing time has increased significantly. \
It indicates that the server is overloaded.
delay: down 5m multiplier 1.5 max 1h
to: dba
@ -68,9 +71,9 @@ component: Riak KV
every: 10s
warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
info: average time between reception of client PUT request and \
subsequent response to the client over the last 3 minutes, \
compared to the average over the last hour
info: Average PUT request processing time over the last 3 minutes, compared to the average over the last hour. \
PUT request processing time has increased significantly. \
It indicates that the server is overloaded.
delay: down 5m multiplier 1.5 max 1h
to: dba
@ -89,5 +92,6 @@ component: Riak KV
every: 10s
warn: $this > 10000
crit: $this > 100000
info: number of processes running in the Erlang VM
info: Number of processes running in the Erlang VM. \
The number of processes is high; you may see performance degradation due to scheduling overhead.
to: dba

View File

@ -15,8 +15,11 @@ component: Network
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
info: average number of dropped packets in the last minute \
due to exceeded net.core.netdev_max_backlog
info: Average number of dropped packets in the last minute due to exceeded netdev backlog queue. \
Netdev backlog queue is full. \
It results in packets received by the interface and not yet processed by the destined subsystem or \
userland application being dropped. \
To fix, increase the [net.core.netdev_max_backlog] limit.
to: sysadmin
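
A hedged sketch of inspecting softnet drops and raising the [net.core.netdev_max_backlog] limit mentioned above; the value is illustrative and should be persisted via /etc/sysctl.conf or sysctl.d:

    # the second column of softnet_stat counts drops due to a full backlog queue
    cat /proc/net/softnet_stat
    # raise the backlog limit (value is illustrative)
    sysctl -w net.core.netdev_max_backlog=2000
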
alarm: 1min_netdev_budget_ran_outs
@ -31,9 +34,11 @@ component: Network
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
net.core.netdev_budget_usecs with work remaining over the last minute \
(this can be a cause for dropped packets)
info: Average number of times ksoftirq ran out of netdev_budget over the last minute. \
SoftIRQ was unable to process all packets available before the CPU budget was exhausted. \
It may result in packet drops and is typically caused by a high-bandwidth interface adding more packets \
to the receive buffer than can be processed during NAPI polling. \
To fix, increase the [net.core.netdev_budget] limit.
to: silent
alarm: 10min_netisr_backlog_exceeded
@ -48,7 +53,9 @@ component: Network
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
info: average number of drops in the last minute \
due to exceeded sysctl net.route.netisr_maxqlen \
(this can be a cause for dropped packets)
info: Average number of dropped packets in the last minute due to exceeded netisr queue length. \
The netisr queue is full. \
Packets received by the interface but not yet processed by the destined subsystem or \
userland application are being dropped. \
To fix, increase the [net.route.netisr_maxqlen] limit.
to: sysadmin

View File

@ -15,7 +15,8 @@ component: Memory
every: 1m
warn: $this > (($status >= $WARNING) ? (20) : (30))
delay: down 15m multiplier 1.5 max 1h
info: percentage of the system RAM swapped in the last 30 minutes
info: Percentage of the system RAM swapped in the last 30 minutes. \
A lot of system RAM was swapped.
to: sysadmin
alarm: used_swap
@ -25,11 +26,15 @@ component: Memory
component: Memory
os: linux freebsd
hosts: *
calc: $used * 100 / ( $used + $free )
calc: ($used + $free) > 0 ? ($used * 100 / ($used + $free)) : 0
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 30s down 15m multiplier 1.5 max 1h
info: swap memory utilization
info: Percentage of used swap. \
High swap memory utilization. \
It may be a sign that the system is experiencing memory pressure, which can degrade performance. \
If no RAM is available, the OOM Killer can start killing processes. \
You might want to check per-process swap usage to find the top consumers.
to: sysadmin
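One way to find the top swap consumers, assuming a Linux host with /proc mounted (a quick sketch, not a polished tool):
  # print "kB process" for every process with pages in swap, largest first
  for f in /proc/[0-9]*/status; do
    awk '/^Name:/ {n=$2} /^VmSwap:/ {print $2, n}' "$f" 2>/dev/null
  done | sort -rn | head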

View File

@ -6,7 +6,8 @@
every: 1m
warn: $this > 6
delay: up 1m down 10m multiplier 1.5 max 1h
info: number of sync() system calls. \
Every call causes all pending modifications to filesystem metadata and \
info: Number of sync() system calls. \
A high number of sync() system calls. \
Every call is very expensive because it causes all pending modifications to filesystem metadata and \
cached file data to be written to the underlying filesystems.
to: sysadmin

View File

@ -12,7 +12,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more systemd service units are in the failed state
info: One or more of the systemd service units are in the failed state. \
It means that the service failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
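A quick sketch of identifying and inspecting failed units (SERVICENAME is the same placeholder used above; substitute the real unit name):
  # list all units currently in the failed state
  systemctl --failed
  # show the status and recent log lines of a specific unit
  systemctl status SERVICENAME
  journalctl -u SERVICENAME --since "1 hour ago"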
## Socket units
@ -26,7 +29,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more systemd socket units are in the failed state
info: One or more of the systemd socket units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
## Target units
@ -40,7 +46,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more systemd target units are in the failed state
info: One or more of the systemd target units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
## Path units
@ -54,7 +63,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more systemd path units are in the failed state
info: One or more of the systemd path units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
## Device units
@ -68,7 +80,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more the systemd device units are in the failed state
info: One or more of the systemd device units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
## Mount units
@ -82,7 +97,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more the systemd mount units are in the failed state
info: One or more of the systemd mount units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
## Automount units
@ -96,7 +114,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more systemd automount units are in the failed state
info: One or more of the systemd automount units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
## Swap units
@ -110,7 +131,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more systemd swap units are in the failed state
info: One or more of the systemd swap units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
## Scope units
@ -124,7 +148,10 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more systemd scope units are in the failed state
info: One or more of the systemd scope units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin
## Slice units
@ -138,5 +165,8 @@ component: Systemd units
every: 10s
warn: $this != nan AND $this == 5
delay: down 5m multiplier 1.5 max 1h
info: one or more systemd slice units are in the failed state
info: One or more of the systemd slice units are in the failed state. \
It means that the unit failed in some way \
(the process exited with an error code or crashed, an operation timed out, or the unit was restarted too many times). \
To see the details, use [systemctl status SERVICENAME].
to: sysadmin

View File

@ -18,5 +18,7 @@ component: Network
warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
delay: up 0 down 5m multiplier 1.5 max 1h
info: IPv4 TCP connections utilization
info: Percentage of used IPv4 TCP connections. \
High IPv4 TCP connections utilization. \
If this value reaches 100%, the system can no longer establish new TCP connections.
to: sysadmin

View File

@ -31,7 +31,10 @@ component: Network
warn: $this > 1
crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
info: average number of overflows in the TCP accept queue over the last minute
info: Average number of overflows in the TCP accept queue over the last minute. \
The TCP accept queue is full. \
It usually means the application is not accepting connections as fast as they arrive. \
To increase the queue length, adjust the [net.core.somaxconn] limit and the application's listen() backlog.
to: sysadmin
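A sketch of how this could be verified and tuned on Linux (the value is illustrative; the listening application may also need a larger listen() backlog):
  # per listener: Recv-Q is the current accept queue length, Send-Q is its limit
  ss -ltn
  # cumulative overflow/drop counters
  nstat -az TcpExtListenOverflows TcpExtListenDrops
  # raise the system-wide cap on the accept queue length (example value)
  sysctl -w net.core.somaxconn=1024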
# THIS IS TOO GENERIC
@ -49,7 +52,10 @@ component: Network
warn: $this > 1
crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
info: average number of dropped packets in the TCP accept queue over the last minute
info: Average number of dropped packets in the TCP accept queue over the last minute. \
The system is dropping incoming TCP connections. \
It can indicate accept queue overflow, out of memory, security issues, no route to a destination, etc. \
To fix overflows, increase the [net.core.somaxconn] limit and the application's listen() backlog.
to: sysadmin
@ -74,8 +80,10 @@ component: Network
warn: $this > 1
crit: $this > (($status == $CRITICAL) ? (0) : (5))
delay: up 10 down 5m multiplier 1.5 max 1h
info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \
(SYN cookies were not enabled)
info: Average number of SYN requests dropped due to the full TCP SYN queue over the last minute. \
The TCP SYN queue is full. The system is dropping incoming TCP SYN requests. \
It may indicate SYN floods. \
If you can determine that the traffic is legitimate, consider enabling SYN cookies.
to: sysadmin
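A minimal sketch for Linux, assuming the traffic has been confirmed legitimate (values are illustrative):
  # SYN cookies: 1 enables them when the SYN queue overflows
  sysctl net.ipv4.tcp_syncookies
  sysctl -w net.ipv4.tcp_syncookies=1
  # enlarge the SYN queue (example value)
  sysctl -w net.ipv4.tcp_max_syn_backlog=4096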
alarm: 1m_tcp_syn_queue_cookies
@ -91,6 +99,8 @@ component: Network
warn: $this > 1
crit: $this > (($status == $CRITICAL) ? (0) : (5))
delay: up 10 down 5m multiplier 1.5 max 1h
info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
info: Average number of SYN cookies sent due to the full TCP SYN queue over the last minute. \
The TCP SYN queue is full. \
It may indicate SYN floods.
to: sysadmin

View File

@ -19,5 +19,9 @@ component: Network
warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
delay: up 0 down 5m multiplier 1.5 max 1h
info: TCP memory utilization
info: Percentage of used TCP memory. \
High TCP memory utilization. \
If the TCP stack uses more memory than the limit, the kernel reports an out-of-memory error and \
some applications may become unresponsive. \
To increase the limit, adjust [net.ipv4.tcp_mem].
to: sysadmin
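A sketch of comparing current TCP memory usage against the limits on Linux (note that tcp_mem values are in pages, typically 4 KiB, not bytes):
  # low / pressure / high thresholds, in pages
  sysctl net.ipv4.tcp_mem
  # current usage: the "mem" field is also in pages
  grep '^TCP:' /proc/net/sockstat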

View File

@ -20,5 +20,8 @@ component: Network
warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
delay: up 0 down 5m multiplier 1.5 max 1h
info: orphan IPv4 TCP sockets utilization
info: Percentage of used orphan IPv4 TCP sockets. \
High orphan IPv4 TCP sockets utilization. \
When the limit is exceeded, orphaned connections (not attached to any user file handle) are reset immediately. \
To increase the limit, adjust [net.ipv4.tcp_max_orphans].
to: sysadmin
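A quick sketch for checking the orphan count against the limit on Linux (the new value is illustrative):
  # the current orphan count is the "orphan" field
  grep '^TCP:' /proc/net/sockstat
  # show and raise the limit (example value)
  sysctl net.ipv4.tcp_max_orphans
  sysctl -w net.ipv4.tcp_max_orphans=131072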

View File

@ -29,10 +29,9 @@ component: Network
warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20)))
delay: up 20s down 60m multiplier 1.2 max 2h
options: no-clear-notification
info: average number of sent TCP RESETS over the last 10 seconds. \
This can indicate a port scan, \
or that a service running on this host has crashed. \
Netdata will not send a clear notification for this alarm.
info: Average number of sent TCP RESETS over the last 10 seconds. \
A high number of sent TCP RESETS. \
This can indicate a port scan or that a service running on the system has crashed.
to: sysadmin
# -----------------------------------------------------------------------------
@ -63,7 +62,8 @@ component: Network
warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
delay: up 20s down 60m multiplier 1.2 max 2h
options: no-clear-notification
info: average number of received TCP RESETS over the last 10 seconds. \
This can be an indication that a service this host needs has crashed. \
Netdata will not send a clear notification for this alarm.
info: Average number of received TCP RESETS over the last 10 seconds. \
A high number of received TCP RESETS. \
This can indicate that the system is trying to establish a connection \
to a server port on which no process is listening.
to: sysadmin

View File

@ -13,5 +13,7 @@ component: Clock
every: 10s
warn: $system.uptime.uptime > 17 * 60 AND $this == 0
delay: down 5m
info: the system time is not synchronized to a reliable server
info: The system time is not synchronized to a reliable server. \
It is strongly recommended to keep the clock in sync with NTP servers; otherwise, \
this leads to unpredictable problems that are difficult to debug.
to: silent
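A quick way to check clock synchronization on a systemd-based Linux host (chronyc only applies if chrony is the NTP client):
  # "System clock synchronized" should report yes
  timedatectl status
  # if chrony is in use, show the offset and the selected source
  chronyc tracking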

View File

@ -15,7 +15,10 @@ component: Network
units: errors
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (10))
info: average number of UDP receive buffer errors over the last minute
info: Average number of UDP receive buffer errors over the last minute. \
The UDP receive buffer is full. \
The system is dropping incoming UDP packets. \
To increase the buffer size, adjust [net.core.rmem_default] and [net.core.rmem_max].
delay: up 1m down 60m multiplier 1.2 max 2h
to: sysadmin
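A sketch of confirming and mitigating the drops on Linux (values are illustrative; applications may also need to request a larger SO_RCVBUF):
  # cumulative UDP buffer error counters
  nstat -az UdpRcvbufErrors UdpSndbufErrors
  # raise the default and maximum socket receive buffer sizes (example values, in bytes)
  sysctl -w net.core.rmem_default=262144
  sysctl -w net.core.rmem_max=8388608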
@ -33,6 +36,9 @@ component: Network
units: errors
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (10))
info: average number of UDP send buffer errors over the last minute
info: Average number of UDP send buffer errors over the last minute. \
The UDP send buffer is full or no kernel memory is available. \
The system is dropping outgoing UDP packets. \
To increase the buffer size, adjust [net.core.wmem_default] and [net.core.wmem_max].
delay: up 1m down 60m multiplier 1.2 max 2h
to: sysadmin

View File

@ -11,7 +11,10 @@ component: Unbound
every: 10s
warn: $this > 5
delay: up 10 down 5m multiplier 1.5 max 1h
info: number of overwritten queries in the request-list
info: Number of overwritten queries. \
Request queue is full. Unbound is overwriting old queued requests. \
It can indicate a Denial of Service attack. \
To increase the queue length, adjust [num-queries-per-thread].
to: sysadmin
template: unbound_request_list_dropped
@ -24,5 +27,8 @@ component: Unbound
every: 10s
warn: $this > 0
delay: up 10 down 5m multiplier 1.5 max 1h
info: number of dropped queries in the request-list
info: Number of dropped queries. \
Request queue is full. Unbound is dropping new incoming requests. \
It can indicate a Denial of Service attack. \
To increase the queue length, adjust [num-queries-per-thread].
to: sysadmin
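A sketch of checking and raising the request-list length, assuming unbound-control is configured for this server:
  # show the current per-thread request-list size
  unbound-control get_option num-queries-per-thread
  # raise num-queries-per-thread in the server: section of unbound.conf, then reload
  unbound-control reload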

View File

@ -17,8 +17,11 @@ component: VMware vCenter
warn: ($this == 1) || ($this == 2)
crit: $this == 3
delay: down 1m multiplier 1.5 max 1h
info: overall system health status \
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
info: Overall system health status (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey). \
The overall system status is unhealthy. \
It indicates that one or more components might become overloaded soon (yellow), \
might be degraded (orange), or might be in an unusable state and the appliance might become unresponsive soon (red). \
To view the details, check the VCSA Health Messages pane.
to: sysadmin
# Components health:
@ -39,8 +42,10 @@ component: VMware vCenter
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
info: swap health status \
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
info: Swap health status (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey). \
The swap component is unhealthy. \
It indicates that the component has minor (yellow) or severe (orange) problems, or will stop functioning soon (red). \
To view the details, check the VCSA Health Messages pane.
to: sysadmin
template: vcsa_storage_health
@ -54,8 +59,10 @@ component: VMware vCenter
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
info: storage health status \
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
info: Storage health status (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey). \
The storage component is unhealthy. \
It indicates that the component has minor (yellow) or severe (orange) problems, or will stop functioning soon (red). \
To view the details, check the VCSA Health Messages pane.
to: sysadmin
template: vcsa_mem_health
@ -69,8 +76,10 @@ component: VMware vCenter
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
info: memory health status \
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
info: Memory health status (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey). \
The memory component is unhealthy. \
It indicates that the component has minor (yellow) or severe (orange) problems, or will stop functioning soon (red). \
To view the details, check the VCSA Health Messages pane.
to: sysadmin
template: vcsa_load_health
@ -84,8 +93,10 @@ component: VMware vCenter
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
info: load health status \
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
info: Load health status (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey). \
The load component is unhealthy. \
It indicates that the component has minor (yellow) or severe (orange) problems, or will stop functioning soon (red). \
To view the details, check the VCSA Health Messages pane.
to: sysadmin
template: vcsa_database_storage_health
@ -99,8 +110,10 @@ component: VMware vCenter
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
info: database storage health status \
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
info: Database storage health status (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey). \
The database storage component is unhealthy. \
It indicates that the component has minor (yellow) or severe (orange) problems, or will stop functioning soon (red). \
To view the details, check the VCSA Health Messages pane.
to: sysadmin
template: vcsa_applmgmt_health
@ -114,8 +127,10 @@ component: VMware vCenter
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
info: applmgmt health status \
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
info: Applmgmt health status (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey). \
The appliance management component is unhealthy. \
It indicates that the component has minor (yellow) or severe (orange) problems, or will stop functioning soon (red). \
To view the details, check the VCSA Health Messages pane.
to: sysadmin
@ -136,6 +151,7 @@ component: VMware vCenter
warn: $this == 4
crit: $this == 3
delay: down 1m multiplier 1.5 max 1h
info: software updates availability status \
(-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
info: Software updates availability status (-1: unknown, 0: green, 2: orange, 3: red, 4: grey). \
Software updates might be available. \
It indicates that security patches might be available (red) or an error retrieving information on software updates (grey).
to: sysadmin

View File

@ -26,7 +26,10 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
info: number of dropped messaged due to full queues in the last minute
info: Number of dropped messages due to full queues in the last minute. \
Message queues are full. VerneMQ is dropping messages. \
This can indicate that consumers or VerneMQ are too slow, or publishers are too fast. \
To increase the queue length, adjust [max_online_messages].
to: sysadmin
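A hedged sketch of where this limit typically lives; the config path is an assumption for packaged installs, and the value is illustrative:
  # show the current queue limit (-1 means unlimited)
  grep -n 'max_online_messages' /etc/vernemq/vernemq.conf
  # raise the value in vernemq.conf, e.g. max_online_messages = 10000, then restart the broker
  systemctl restart vernemq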
template: vernemq_queue_message_expired
@ -39,7 +42,8 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
info: number of messages which expired before delivery in the last minute
info: Number of messages which expired before delivery in the last minute. \
Too many messages were undelivered due to expiration.
to: sysadmin
template: vernemq_queue_message_unhandled
@ -52,7 +56,8 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
info: number of unhandled messages (connections with clean session=true) in the last minute
info: Number of unhandled messages in the last minute. \
Too many messages were undelivered due to connections with clean session=true.
to: sysadmin
# Erlang VM
@ -83,7 +88,11 @@ component: VerneMQ
every: 1m
warn: $this > 0
delay: up 5m down 5m multiplier 1.5 max 1h
info: amount of traffic dropped during communication with the cluster nodes in the last minute
info: Amount of traffic dropped during communication with the cluster nodes in the last minute. \
The outgoing cluster buffer is full. \
VerneMQ is experiencing problems with inter-node message delivery. \
Most likely a remote node is down or unreachable. \
To increase the buffer size, adjust [outgoing_clustering_buffer_size].
to: sysadmin
template: vernemq_netsplits
@ -96,7 +105,10 @@ component: VerneMQ
every: 10s
warn: $this > 0
delay: down 5m multiplier 1.5 max 2h
info: number of detected netsplits (split brain situation) in the last minute
info: Number of detected netsplits in the last minute. \
Split-brain detected. \
This is usually caused by the failure of one or more network devices, \
leaving a cluster in which nodes can no longer reach each other.
to: sysadmin
# Unsuccessful CONNACK

View File

@ -9,5 +9,8 @@ component: WHOIS
every: 60s
warn: $this < $days_until_expiration_warning*24*60*60
crit: $this < $days_until_expiration_critical*24*60*60
info: time until the domain name registration expires
info: Time until the domain name registration expires. \
The domain name registration expires soon. \
If you do not renew it, the domain will be deactivated. \
Consider scheduling the renewal.
to: webmaster
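A quick manual check of the registration expiration date (example.com is a placeholder domain):
  # print the registry expiration line(s)
  whois example.com | grep -i 'expir'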

View File

@ -14,7 +14,8 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average CPU utilization over the last 10 minutes
info: Average CPU utilization over the last 10 minutes. \
High CPU utilization.
to: sysadmin
@ -33,7 +34,8 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: memory utilization
info: Percentage of used RAM memory. \
High RAM utilization.
to: sysadmin
template: wmi_swap_in_use
@ -49,7 +51,8 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: swap memory utilization
info: Percentage of used swap. \
High swap memory utilization.
to: sysadmin
@ -68,7 +71,8 @@ component: Network
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of inbound discarded packets for the network interface in the last 10 minutes
info: Number of inbound discarded packets for the network interface in the last 10 minutes. \
The network interface discarded many inbound packets.
to: sysadmin
template: wmi_outbound_packets_discarded
@ -84,7 +88,8 @@ component: Network
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of outbound discarded packets for the network interface in the last 10 minutes
info: Number of outbound discarded packets for the network interface in the last 10 minutes. \
The network interface discarded many outbound packets.
to: sysadmin
template: wmi_inbound_packets_errors
@ -100,7 +105,8 @@ component: Network
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of inbound errors for the network interface in the last 10 minutes
info: Number of inbound errors for the network interface in the last 10 minutes. \
The network interface received many bad packets.
to: sysadmin
template: wmi_outbound_packets_errors
@ -116,7 +122,8 @@ component: Network
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of outbound errors for the network interface in the last 10 minutes
info: Number of outbound errors for the network interface in the last 10 minutes. \
The network interface experienced many transmit problems.
to: sysadmin
@ -135,5 +142,6 @@ component: Disk
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: disk space utilization
info: Percentage of used disk space. \
High disk space utilization.
to: sysadmin

View File

@ -9,7 +9,9 @@ component: x509 certificates
every: 60s
warn: $this < $days_until_expiration_warning*24*60*60
crit: $this < $days_until_expiration_critical*24*60*60
info: time until x509 certificate expires
info: Time until the X.509 certificate expires. \
The X.509 certificate will expire soon. \
Renew the certificate or create a new one.
to: webmaster
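A quick manual check of a certificate's expiration date; the host and file path below are placeholders:
  # for a live endpoint
  openssl s_client -connect example.com:443 -servername example.com </dev/null 2>/dev/null | openssl x509 -noout -enddate
  # for a certificate file on disk
  openssl x509 -noout -enddate -in /path/to/cert.pem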
template: x509check_revocation_status
@ -20,5 +22,6 @@ component: x509 certificates
calc: $revoked
every: 60s
crit: $this != nan AND $this != 0
info: x509 certificate revocation status (0: revoked, 1: valid)
info: X.509 certificate revocation status (0: revoked, 1: valid). \
The X.509 certificate has been revoked.
to: webmaster