Skip to content

Commit

Permalink
fix node-exporter alerts (#389)
Browse files Browse the repository at this point in the history
to show node hostname instead of IP address

Signed-off-by: Andrei Kvapil <[email protected]>

Signed-off-by: Andrei Kvapil <[email protected]>
  • Loading branch information
kvaps authored Oct 3, 2024
1 parent b605c85 commit 4eaca42
Showing 1 changed file with 49 additions and 49 deletions.
98 changes: 49 additions & 49 deletions packages/system/monitoring/alerts/node-exporter.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,7 @@ spec:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
Expand All @@ -25,12 +25,12 @@ spec:
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
Expand All @@ -45,12 +45,12 @@ spec:
for: 1h
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
Expand All @@ -63,12 +63,12 @@ spec:
for: 30m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
Expand All @@ -81,12 +81,12 @@ spec:
for: 30m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
inodes left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
Expand All @@ -101,12 +101,12 @@ spec:
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
inodes left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
Expand All @@ -121,12 +121,12 @@ spec:
for: 1h
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
Expand All @@ -139,12 +139,12 @@ spec:
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
Expand All @@ -157,11 +157,11 @@ spec:
for: 1h
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
description: '{{ $labels.node }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
Expand All @@ -170,11 +170,11 @@ spec:
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
description: '{{ $labels.node }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
Expand All @@ -183,7 +183,7 @@ spec:
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
Expand All @@ -194,22 +194,22 @@ spec:
> 0.75
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector on {{ $labels.instance }} failed
description: Node Exporter text file collector on {{ $labels.node }} failed
to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeClockSkewDetected
annotations:
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
description: Clock at {{ $labels.node }} is out of sync by more than 0.05s.
Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
Expand All @@ -228,11 +228,11 @@ spec:
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeClockNotSynchronising
annotations:
description: Clock at {{ $labels.instance }} is not synchronising. Ensure
description: Clock at {{ $labels.node }} is not synchronising. Ensure
NTP is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
Expand All @@ -243,11 +243,11 @@ spec:
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
description: RAID array '{{ $labels.device }}' at {{ $labels.node }} is
in degraded state due to one or more disks failures. Number of spare drives
is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
Expand All @@ -258,23 +258,23 @@ spec:
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array at {{ $labels.instance }} failed.
description: At least one device in RAID array at {{ $labels.node }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array.
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
> 0
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently
description: File descriptors limit at {{ $labels.node }} is currently
at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
Expand All @@ -285,11 +285,11 @@ spec:
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently
description: File descriptors limit at {{ $labels.node }} is currently
at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
Expand All @@ -300,25 +300,25 @@ spec:
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeCPUHighUsage
annotations:
description: |
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
CPU usage at {{ $labels.node }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
summary: High CPU usage.
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
mode!="idle"}[2m]))) * 100 > 90
for: 15m
labels:
severity: informational
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeSystemSaturation
annotations:
description: |
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
System load per core at {{ $labels.node }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
This might indicate this instance resources saturation and can cause it becoming unresponsive.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
summary: System saturated, load per core is very high.
Expand All @@ -328,38 +328,38 @@ spec:
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeMemoryMajorPagesFaults
annotations:
description: |
Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
Memory major pages are occurring at very high rate at {{ $labels.node }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
Please check that there is enough memory available at this instance.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
summary: Memory major page faults are occurring at very high rate.
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeMemoryHighUtilization
annotations:
description: |
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
Memory is filling up at {{ $labels.node }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
summary: Host is running out of memory.
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
* 100) > 90
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
exported_instance: '{{ $labels.node }}'
service: node-exporter
- alert: NodeDiskIOSaturation
annotations:
description: |
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.node }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
This symptom might indicate disk saturation.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
summary: Disk IO queue is high.
Expand All @@ -368,29 +368,29 @@ spec:
for: 30m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeSystemdServiceFailed
annotations:
description: Systemd service {{ $labels.name }} has entered failed state at
{{ $labels.instance }}
{{ $labels.node }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
summary: Systemd service has entered failed state.
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.name }}'
exported_instance: '{{ $labels.node }}/{{ $labels.name }}'
service: node-exporter
- alert: NodeBondingDegraded
annotations:
description: Bonding interface {{ $labels.master }} on {{ $labels.instance
description: Bonding interface {{ $labels.master }} on {{ $labels.node
}} is in degraded state due to one or more slave failures.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
summary: Bonding interface is degraded
expr: (node_bonding_slaves - node_bonding_active) != 0
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.master }}'
exported_instance: '{{ $labels.node }}/{{ $labels.master }}'
service: node-exporter

0 comments on commit 4eaca42

Please sign in to comment.