New extreme load alert.

pull/24/head
Egon Rijpkema 5 months ago
commit 53f9a22938
1 changed file: roles/prom_server/templates/etc/alerting.rules (73 lines changed)

@@ -1,7 +1,7 @@
 groups:
 - name: basic
   rules:
-  - alert: InstanceDown
+  - alert: InstanceDownFromKnyft
     expr: up{job="node"} == 0
     for: 10m
     labels:
@@ -26,70 +26,38 @@ groups:
     annotations:
       description: '{{ $labels.instance }} has a clock offset > 1 second.'
       summary: '{{ $labels.instance }} has clock drift.'
-  - alert: DiskWillFillIn8Hours
-    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[2h], 8 * 3600) < 0
-    for: 2h
-    labels:
-      severity: page
-    annotations:
-      description: Instance {{ $labels.instance }} will fill up within 8 hours
-      summary: '{{ $labels.instance }} disk full'
-  - alert: DiskWillFillIn72Hours
-    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[6h], 72 * 3600) < 0
-    for: 8h
-    labels:
-      severity: page
-    annotations:
-      description: Instance {{ $labels.instance }} will fill up within 72 hours
-      summary: '{{ $labels.instance }} disk almost full'
-  - alert: DiskFull
-    expr: node_filesystem_free{job="node",mountpoint!~"/net|/cvmfs.*|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse"} < 5.24288e+06
-    for: 15m
+  - alert: high iowait
+    expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[10m])) * 100) > 98
+    for: 10m
     labels:
       severity: page
     annotations:
-      description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
-      summary: '{{ $labels.instance }} Disk full'
-  - alert: tmpFull
-    expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
-    for: 30m
+      description: '{{ $labels.instance }} has high iowait.'
+      summary: '{{ $labels.instance }} has high iowait.'
+  - alert: extreme load
+    expr: node_load1 > 300
+    for: 5m
     labels:
       severity: page
     annotations:
-      description: Instance {{ $labels.instance }} Has a full /tmp
-      summary: '{{ $labels.instance }} /tmp full'
+      description: '{{ $labels.instance }} high load.'
+      summary: '{{ $labels.instance }} load > 300'
   - alert: DiskWillFillIn8Hours
-    expr: predict_linear(node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[2h], 8 * 3600) < 0
+    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse.*"}[2h], 8 * 3600) < 0
     for: 2h
     labels:
      severity: page
     annotations:
       description: Instance {{ $labels.instance }} will fill up within 8 hours
       summary: '{{ $labels.instance }} disk full'
-  - alert: DiskWillFillIn72Hours
-    expr: predict_linear(node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[6h], 72 * 3600) < 0
-    for: 8h
-    labels:
-      severity: page
-    annotations:
-      description: Instance {{ $labels.instance }} will fill up within 72 hours
-      summary: '{{ $labels.instance }} disk almost full'
   - alert: DiskFull
-    expr: node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse?"} < 5.24288e+06
+    expr: node_filesystem_free_bytes{job="node",mountpoint!~"/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.*"} < 5.24288e+06
     for: 5m
     labels:
       severity: page
     annotations:
       description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
-      summary: '{{ $labels.instance }} Disk full'
-  - alert: tmpFull
-    expr: node_filesystem_free_bytes{job="node",mountpoint="/tmp"} < 5242880
-    for: 30m
-    labels:
-      severity: page
-    annotations:
-      description: Instance {{ $labels.instance }} Has a full /tmp
-      summary: '{{ $labels.instance }} /tmp full'
+      summary: '{{ $labels.instance }} {{ $labels.mountpoint }} Disk full'
   - alert: NodeRebooted
     expr: delta(node_boot_time[1h]) > 10
     for: 1m
@@ -98,6 +66,14 @@ groups:
     annotations:
       description: Instance {{ $labels.instance }} has been rebooted.
       summary: '{{ $labels.instance }} rebooted'
+  - alert: LustreHealthNotOK
+    expr: lustre_health_check != 1
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} lustre down!.
+      summary: '{{ $labels.instance }} lustre down'
 - name: gpu
   rules:
   - alert: UncorrectedECC
@@ -169,7 +145,7 @@ groups:
     annotations:
       description: '{{ $labels.instance }}: Has high load.'
       summary: '{{ $labels.instance }} has high load.'
-  - alert: LoadOSS
+  - alert: oss01_high_load
     expr: 100*(1-avg(rate(node_cpu_seconds_total{instance="pg-oss01.hpc.local:9100",job="node",mode="idle"}[5m]))) > 75
     for: 10m
     labels:
@@ -177,7 +153,7 @@ groups:
     annotations:
       description: '{{ $labels.instance }}: Has high load.'
       summary: '{{ $labels.instance }} has high load.'
-  - alert: LoadOSS
+  - alert: oss02_high_load
     expr: 100*(1-avg(rate(node_cpu_seconds_total{instance="pg-oss02.hpc.local:9100",job="node",mode="idle"}[5m]))) > 75
     for: 10m
     labels:
@@ -241,7 +217,6 @@ groups:
     annotations:
       description: Merlin Ceph is in Warn state longer than 30m, please check status of pools and OSDs
       summary: CEPH in WARN
-#
 # - alert: Node drained.
 #   expr: min_over_time(slurm_nodes_drain{job="slurm_exorter"}[2h]) < slurm_nodes_drain{job="slurm_exorter"}
 #   for: 1h
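
Rule files in this format can be validated and unit-tested with promtool, which ships with Prometheus: "promtool check rules alerting.rules" checks syntax, and "promtool test rules" runs tests like the minimal sketch below against the new "extreme load" alert. This sketch is not part of the commit; the instance name pg-node001.hpc.local:9100 and the file names are illustrative assumptions, and the rules file under test must be the rendered template, not the Jinja2 source.

# test_extreme_load.yml -- minimal promtool unit test (assumed file name).
# Run with: promtool test rules test_extreme_load.yml
rule_files:
  - alerting.rules            # rendered rules file (assumed path)

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # hypothetical node whose 1-minute load average sits at 350
      - series: 'node_load1{instance="pg-node001.hpc.local:9100"}'
        values: '350+0x10'    # 350 at t=0m,1m,...,10m
    alert_rule_test:
      # node_load1 > 300 has held for at least 5m (the "for:" clause)
      # by t=6m, so the alert is expected to be firing
      - eval_time: 6m
        alertname: extreme load
        exp_alerts:
          - exp_labels:
              severity: page
              instance: pg-node001.hpc.local:9100
            exp_annotations:
              description: 'pg-node001.hpc.local:9100 high load.'
              summary: 'pg-node001.hpc.local:9100 load > 300'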
