fix node-exporter alerts (#389)

Show the node hostname instead of the IP address in the alert descriptions and in the exported_instance label.

Signed-off-by: Andrei Kvapil <[email protected]>
Signed-off-by: Andrei Kvapil <[email protected]>
aenix-io · Oct 3, 2024 · 4eaca42 · 4eaca42
1 parent b605c85
commit 4eaca42
Showing 1 changed file with 49 additions and 49 deletions.
diff --git a/packages/system/monitoring/alerts/node-exporter.yaml b/packages/system/monitoring/alerts/node-exporter.yaml
@@ -10,7 +10,7 @@ spec:
     - alert: NodeFilesystemSpaceFillingUp
       annotations:
         description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
-          }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
+          }}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
           space left and is filling up.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
         summary: Filesystem is predicted to run out of space within the next 24 hours.
@@ -25,12 +25,12 @@ spec:
       for: 1h
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeFilesystemSpaceFillingUp
       annotations:
         description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
-          }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
+          }}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
           space left and is filling up fast.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
         summary: Filesystem is predicted to run out of space within the next 4 hours.
@@ -45,12 +45,12 @@ spec:
       for: 1h
       labels:
         severity: critical
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeFilesystemAlmostOutOfSpace
       annotations:
         description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
-          }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
+          }}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
           space left.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
         summary: Filesystem has less than 5% space left.
@@ -63,12 +63,12 @@ spec:
       for: 30m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeFilesystemAlmostOutOfSpace
       annotations:
         description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
-          }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
+          }}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
           space left.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
         summary: Filesystem has less than 3% space left.
@@ -81,12 +81,12 @@ spec:
       for: 30m
       labels:
         severity: critical
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeFilesystemFilesFillingUp
       annotations:
         description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
-          }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
+          }}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
           inodes left and is filling up.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
         summary: Filesystem is predicted to run out of inodes within the next 24 hours.
@@ -101,12 +101,12 @@ spec:
       for: 1h
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeFilesystemFilesFillingUp
       annotations:
         description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
-          }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
+          }}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
           inodes left and is filling up fast.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
         summary: Filesystem is predicted to run out of inodes within the next 4 hours.
@@ -121,12 +121,12 @@ spec:
       for: 1h
       labels:
         severity: critical
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeFilesystemAlmostOutOfFiles
       annotations:
         description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
-          }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
+          }}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
           inodes left.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
         summary: Filesystem has less than 5% inodes left.
@@ -139,12 +139,12 @@ spec:
       for: 1h
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeFilesystemAlmostOutOfFiles
       annotations:
         description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
-          }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
+          }}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
           inodes left.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
         summary: Filesystem has less than 3% inodes left.
@@ -157,11 +157,11 @@ spec:
       for: 1h
       labels:
         severity: critical
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeNetworkReceiveErrs
       annotations:
-        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+        description: '{{ $labels.node }} interface {{ $labels.device }} has encountered
           {{ printf "%.0f" $value }} receive errors in the last two minutes.'
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
         summary: Network interface is reporting many receive errors.
@@ -170,11 +170,11 @@ spec:
       for: 1h
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeNetworkTransmitErrs
       annotations:
-        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+        description: '{{ $labels.node }} interface {{ $labels.device }} has encountered
           {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
         summary: Network interface is reporting many transmit errors.
@@ -183,7 +183,7 @@ spec:
       for: 1h
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeHighNumberConntrackEntriesUsed
       annotations:
@@ -194,22 +194,22 @@ spec:
         > 0.75
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeTextFileCollectorScrapeError
       annotations:
-        description: Node Exporter text file collector on {{ $labels.instance }} failed
+        description: Node Exporter text file collector on {{ $labels.node }} failed
           to scrape.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
         summary: Node Exporter text file collector failed to scrape.
       expr: node_textfile_scrape_error{job="node-exporter"} == 1
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeClockSkewDetected
       annotations:
-        description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
+        description: Clock at {{ $labels.node }} is out of sync by more than 0.05s.
           Ensure NTP is configured correctly on this host.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
         summary: Clock skew detected.
@@ -228,11 +228,11 @@ spec:
       for: 10m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeClockNotSynchronising
       annotations:
-        description: Clock at {{ $labels.instance }} is not synchronising. Ensure
+        description: Clock at {{ $labels.node }} is not synchronising. Ensure
           NTP is configured on this host.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
         summary: Clock not synchronising.
@@ -243,11 +243,11 @@ spec:
       for: 10m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeRAIDDegraded
       annotations:
-        description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
+        description: RAID array '{{ $labels.device }}' at {{ $labels.node }} is
           in degraded state due to one or more disks failures. Number of spare drives
           is insufficient to fix issue automatically.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
@@ -258,23 +258,23 @@ spec:
       for: 15m
       labels:
         severity: critical
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeRAIDDiskFailure
       annotations:
-        description: At least one device in RAID array at {{ $labels.instance }} failed.
+        description: At least one device in RAID array at {{ $labels.node }} failed.
           Array '{{ $labels.device }}' needs attention and possibly a disk swap.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
         summary: Failed device in RAID array.
       expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
         > 0
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeFileDescriptorLimit
       annotations:
-        description: File descriptors limit at {{ $labels.instance }} is currently
+        description: File descriptors limit at {{ $labels.node }} is currently
           at {{ printf "%.2f" $value }}%.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
         summary: Kernel is predicted to exhaust file descriptors limit soon.
@@ -285,11 +285,11 @@ spec:
       for: 15m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeFileDescriptorLimit
       annotations:
-        description: File descriptors limit at {{ $labels.instance }} is currently
+        description: File descriptors limit at {{ $labels.node }} is currently
           at {{ printf "%.2f" $value }}%.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
         summary: Kernel is predicted to exhaust file descriptors limit soon.
@@ -300,25 +300,25 @@ spec:
       for: 15m
       labels:
         severity: critical
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeCPUHighUsage
       annotations:
         description: |
-          CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
+          CPU usage at {{ $labels.node }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
         summary: High CPU usage.
       expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
         mode!="idle"}[2m]))) * 100 > 90
       for: 15m
       labels:
         severity: informational
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeSystemSaturation
       annotations:
         description: |
-          System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+          System load per core at {{ $labels.node }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
           This might indicate this instance resources saturation and can cause it becoming unresponsive.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
         summary: System saturated, load per core is very high.
@@ -328,38 +328,38 @@ spec:
       for: 15m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeMemoryMajorPagesFaults
       annotations:
         description: |
-          Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+          Memory major pages are occurring at very high rate at {{ $labels.node }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
           Please check that there is enough memory available at this instance.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
         summary: Memory major page faults are occurring at very high rate.
       expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
       for: 15m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeMemoryHighUtilization
       annotations:
         description: |
-          Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
+          Memory is filling up at {{ $labels.node }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
         summary: Host is running out of memory.
       expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
         * 100) > 90
       for: 15m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}'
+        exported_instance: '{{ $labels.node }}'
         service: node-exporter
     - alert: NodeDiskIOSaturation
       annotations:
         description: |
-          Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
+          Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.node }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
           This symptom might indicate disk saturation.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
         summary: Disk IO queue is high.
@@ -368,29 +368,29 @@ spec:
       for: 30m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
         service: node-exporter
     - alert: NodeSystemdServiceFailed
       annotations:
         description: Systemd service {{ $labels.name }} has entered failed state at
-          {{ $labels.instance }}
+          {{ $labels.node }}
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
         summary: Systemd service has entered failed state.
       expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
       for: 5m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.name }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.name }}'
         service: node-exporter
     - alert: NodeBondingDegraded
       annotations:
-        description: Bonding interface {{ $labels.master }} on {{ $labels.instance
+        description: Bonding interface {{ $labels.master }} on {{ $labels.node
           }} is in degraded state due to one or more slave failures.
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
         summary: Bonding interface is degraded
       expr: (node_bonding_slaves - node_bonding_active) != 0
       for: 5m
       labels:
         severity: warning
-        exported_instance: '{{ $labels.instance }}/{{ $labels.master }}'
+        exported_instance: '{{ $labels.node }}/{{ $labels.master }}'
         service: node-exporter