|
|
|
@@ -1,7 +1,7 @@
|
|
|
|
|
groups: |
|
|
|
|
- name: basic |
|
|
|
|
rules: |
|
|
|
|
- alert: InstanceDown |
|
|
|
|
- alert: InstanceDownFromKnyft |
|
|
|
|
expr: up{job="node"} == 0 |
|
|
|
|
for: 10m |
|
|
|
|
labels: |
|
|
|
@@ -26,70 +26,38 @@ groups:
|
|
|
|
|
annotations: |
|
|
|
|
description: '{{ $labels.instance }} has a clock offset > 1 second.' |
|
|
|
|
summary: '{{ $labels.instance }} has clock drift.' |
|
|
|
|
- alert: DiskWillFillIn8Hours |
|
|
|
|
expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[2h], 8 * 3600) < 0 |
|
|
|
|
for: 2h |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} will fill up within 8 hours |
|
|
|
|
summary: '{{ $labels.instance }} disk full' |
|
|
|
|
- alert: DiskWillFillIn72Hours |
|
|
|
|
expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[6h], 72 * 3600) < 0 |
|
|
|
|
for: 8h |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} will fill up within 72 hours |
|
|
|
|
summary: '{{ $labels.instance }} disk almost full' |
|
|
|
|
- alert: DiskFull |
|
|
|
|
expr: node_filesystem_free{job="node",mountpoint!~"/net|/cvmfs.*|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse"} < 5.24288e+06 |
|
|
|
|
for: 15m |
|
|
|
|
- alert: HighIOWait |
|
|
|
|
expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[10m])) * 100) > 98 |
|
|
|
|
for: 10m |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}. |
|
|
|
|
summary: '{{ $labels.instance }} Disk full' |
|
|
|
|
- alert: tmpFull |
|
|
|
|
expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880 |
|
|
|
|
for: 30m |
|
|
|
|
description: '{{ $labels.instance }} has high iowait.' |
|
|
|
|
summary: '{{ $labels.instance }} has high iowait.' |
|
|
|
|
- alert: ExtremeLoad |
|
|
|
|
expr: node_load1 > 300 |
|
|
|
|
for: 5m |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} Has a full /tmp |
|
|
|
|
summary: '{{ $labels.instance }} /tmp full' |
|
|
|
|
description: '{{ $labels.instance }} high load.' |
|
|
|
|
summary: '{{ $labels.instance }} load > 300' |
|
|
|
|
- alert: DiskWillFillIn8Hours |
|
|
|
|
expr: predict_linear(node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[2h], 8 * 3600) < 0 |
|
|
|
|
expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse.*"}[2h], 8 * 3600) < 0 |
|
|
|
|
for: 2h |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} will fill up within 8 hours |
|
|
|
|
summary: '{{ $labels.instance }} disk full' |
|
|
|
|
- alert: DiskWillFillIn72Hours |
|
|
|
|
expr: predict_linear(node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[6h], 72 * 3600) < 0 |
|
|
|
|
for: 8h |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} will fill up within 72 hours |
|
|
|
|
summary: '{{ $labels.instance }} disk almost full' |
|
|
|
|
- alert: DiskFull |
|
|
|
|
expr: node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse?"} < 5.24288e+06 |
|
|
|
|
expr: node_filesystem_free_bytes{job="node",mountpoint!~"/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.*"} < 5.24288e+06 |
|
|
|
|
for: 5m |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}. |
|
|
|
|
summary: '{{ $labels.instance }} Disk full' |
|
|
|
|
- alert: tmpFull |
|
|
|
|
expr: node_filesystem_free_bytes{job="node",mountpoint="/tmp"} < 5242880 |
|
|
|
|
for: 30m |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} Has a full /tmp |
|
|
|
|
summary: '{{ $labels.instance }} /tmp full' |
|
|
|
|
summary: '{{ $labels.instance }} {{ $labels.mountpoint }} Disk full' |
|
|
|
|
- alert: NodeRebooted |
|
|
|
|
expr: delta(node_boot_time_seconds[1h]) > 10 |
|
|
|
|
for: 1m |
|
|
|
@@ -98,6 +66,14 @@ groups:
|
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} has been rebooted. |
|
|
|
|
summary: '{{ $labels.instance }} rebooted' |
|
|
|
|
- alert: LustreHealthNotOK |
|
|
|
|
expr: lustre_health_check != 1 |
|
|
|
|
for: 1m |
|
|
|
|
labels: |
|
|
|
|
severity: page |
|
|
|
|
annotations: |
|
|
|
|
description: Instance {{ $labels.instance }} lustre is down. |
|
|
|
|
summary: '{{ $labels.instance }} lustre down' |
|
|
|
|
- name: gpu |
|
|
|
|
rules: |
|
|
|
|
- alert: UncorrectedECC |
|
|
|
@@ -169,7 +145,7 @@ groups:
|
|
|
|
|
annotations: |
|
|
|
|
description: '{{ $labels.instance }}: Has high load.' |
|
|
|
|
summary: '{{ $labels.instance }} has high load.' |
|
|
|
|
- alert: LoadOSS |
|
|
|
|
- alert: oss01_high_load |
|
|
|
|
expr: 100*(1-avg(rate(node_cpu_seconds_total{instance="pg-oss01.hpc.local:9100",job="node",mode="idle"}[5m]))) > 75 |
|
|
|
|
for: 10m |
|
|
|
|
labels: |
|
|
|
@@ -177,7 +153,7 @@ groups:
|
|
|
|
|
annotations: |
|
|
|
|
description: '{{ $labels.instance }}: Has high load.' |
|
|
|
|
summary: '{{ $labels.instance }} has high load.' |
|
|
|
|
- alert: LoadOSS |
|
|
|
|
- alert: oss02_high_load |
|
|
|
|
expr: 100*(1-avg(rate(node_cpu_seconds_total{instance="pg-oss02.hpc.local:9100",job="node",mode="idle"}[5m]))) > 75 |
|
|
|
|
for: 10m |
|
|
|
|
labels: |
|
|
|
@@ -241,7 +217,6 @@ groups:
|
|
|
|
|
annotations: |
|
|
|
|
description: Merlin Ceph is in Warn state longer than 30m, please check status of pools and OSDs |
|
|
|
|
summary: CEPH in WARN |
|
|
|
|
# |
|
|
|
|
# - alert: Node drained. |
|
|
|
|
# expr: min_over_time(slurm_nodes_drain{job="slurm_exporter"}[2h]) < slurm_nodes_drain{job="slurm_exporter"} |
|
|
|
|
# for: 1h |
|
|
|
|