
Applies to SUSE Enterprise Storage 5.5 (SES 5 & SES 5.5)

B Default Alerts for SUSE Enterprise Storage
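
The following listing shows the default Prometheus alert rules that ship with SUSE Enterprise Storage. Each alert consists of a PromQL expression (expr) over the metrics exported by the cluster, an optional hold duration (for) that the condition must persist before the alert fires, and labels and annotations that are passed on to Alertmanager for routing and notification text.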

groups:
  - name: cluster health
    rules:
      - alert: health error
        expr: ceph_health_status == 2
        for: 5m
        labels:
          severity: critical
          type: ses_default
        annotations:
          description: Ceph in error for > 5m
      - alert: unhealthy
        expr: ceph_health_status != 0
        for: 15m
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: Ceph not healthy for > 15m
  - name: mon
    rules:
      - alert: low monitor quorum count
        expr: ceph_monitor_quorum_count < 3
        labels:
          severity: critical
          type: ses_default
        annotations:
          description: Monitor count in quorum is low
  - name: osd
    rules:
      - alert: 10% OSDs down
        expr: sum(ceph_osd_down) / count(ceph_osd_in) >= 0.1
        labels:
          severity: critical
          type: ses_default
        annotations:
          description: More than 10% of OSDs are down
      - alert: OSD down
        expr: sum(ceph_osd_down) >= 1
        for: 15m
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: One or more OSDs down for more than 15 minutes
      - alert: OSDs near full
        expr: (ceph_osd_utilization unless on(osd) ceph_osd_down) > 80
        labels:
          severity: critical
          type: ses_default
        annotations:
          description: OSD {{ $labels.osd }} is dangerously full, over 80%
      # alert on single OSDs flapping
      - alert: flap osd
        expr: rate(ceph_osd_up[5m]) * 60 > 1
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: >
            OSD {{ $labels.osd }} was marked down and back up at least once a
            minute for 5 minutes.
      # alert on high deviation from average PG count
      - alert: high pg count deviation
        expr: abs(((ceph_osd_pgs > 0) - on (job) group_left avg(ceph_osd_pgs > 0) by (job)) / on (job) group_left avg(ceph_osd_pgs > 0) by (job)) > 0.35
        for: 5m
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: >
            OSD {{ $labels.osd }} deviates by more than 35% from the
            average PG count
      # alert on high commit latency...but how high is too high
  - name: mds
    rules:
      # no mds metrics are exported yet
  - name: mgr
    rules:
      # no mgr metrics are exported yet
  - name: pgs
    rules:
      - alert: pgs inactive
        expr: ceph_total_pgs - ceph_active_pgs > 0
        for: 5m
        labels:
          severity: critical
          type: ses_default
        annotations:
          description: One or more PGs are inactive for more than 5 minutes.
      - alert: pgs unclean
        expr: ceph_total_pgs - ceph_clean_pgs > 0
        for: 15m
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: One or more PGs are not clean for more than 15 minutes.
  - name: nodes
    rules:
      - alert: root volume full
        expr: node_filesystem_avail{mountpoint="/"} / node_filesystem_size{mountpoint="/"} < 0.1
        labels:
          severity: critical
          type: ses_default
        annotations:
          description: Root volume (OSD and MON store) is dangerously full (< 10% free)
      # alert on NIC packet error and drop rates > 1 packet/s
      - alert: network packets dropped
        expr: irate(node_network_receive_drop{device!="lo"}[5m]) + irate(node_network_transmit_drop{device!="lo"}[5m]) > 1
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: >
            Node {{ $labels.instance }} experiences packet drop > 1
            packet/s on interface {{ $labels.device }}
      - alert: network packet errors
        expr: irate(node_network_receive_errs{device!="lo"}[5m]) + irate(node_network_transmit_errs{device!="lo"}[5m]) > 1
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: >
            Node {{ $labels.instance }} experiences packet errors > 1
            packet/s on interface {{ $labels.device }}
      # predict fs fill-up times
      - alert: storage filling
        expr: ((node_filesystem_free - node_filesystem_size) / deriv(node_filesystem_free[2d]) <= 5) > 0
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: >
            Mountpoint {{ $labels.mountpoint }} will be full in less than 5 days
            assuming the average fill-up rate of the past 48 hours.
  - name: pools
    rules:
      - alert: pool full
        expr: ceph_pool_used_bytes / (ceph_pool_used_bytes + ceph_pool_available_bytes) > 0.9
        labels:
          severity: critical
          type: ses_default
        annotations:
          description: Pool {{ $labels.pool }} at 90% capacity or over
      - alert: pool filling up
        expr: (-ceph_pool_used_bytes / deriv(ceph_pool_available_bytes[2d]) <= 5) > 0
        labels:
          severity: warning
          type: ses_default
        annotations:
          description: >
            Pool {{ $labels.pool }} will be full in less than 5 days
            assuming the average fill-up rate of the past 48 hours.
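
To load a rules file like this one, Prometheus must reference it from the rule_files section of its prometheus.yml and know where to send the resulting alerts. The snippet below is a minimal sketch, not the configuration that SES generates; the file path and the Alertmanager address are assumptions for illustration:

rule_files:
  # assumed location of the rules listed above
  - "/etc/prometheus/ses_default_alerts.yml"

alerting:
  alertmanagers:
    - static_configs:
        # assumed Alertmanager address
        - targets:
            - "localhost:9093"

After editing a rules file, it can be checked for syntax errors before Prometheus is reloaded, for example with promtool check rules /etc/prometheus/ses_default_alerts.yml on Prometheus 2.x.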