Hi Isaac,

I've achieved this by writing a service check (which I call a "cluster
check") which looks at the states of the individual checks and returns
ok, warn, crit or unknown accordingly. I have notifications enabled
for the cluster check, but not for the individual checks, so that if 1
or more (or some percentage) of individual checks fail, a single
notification will get sent out for "the cluster."

Here's what one of my cluster checks looks like. The things to pay
attention to are the logic inside the "for (h in get_objects(Host))"
loop and, in particular, the vars.cluster_state function:

/*
 * Cluster check.
 *
 * For every cluster name in all_clusters, one "dummy" service is applied to
 * the host whose name equals the cluster name. The service does not probe
 * anything itself: at check time it looks up the per-instance
 * "check location service throttled" services of the cluster members and
 * derives a single state (vars.cluster_state) and a summary text
 * (vars.cluster_text) from them. Notifications are meant to be enabled on
 * this service only, so one notification is sent per cluster.
 */
for (cl in all_clusters) {
  apply Service "check cluster location service throttled" use(cl) {
    var dummyhost = get_host(cl)
    import "generic-service"
    check_command = "dummy"
    notes = "restart the service using svc -t, escalate if it doesn’t clear"
    action_url = "location_service_throttled.txt"
    check_interval = 3m
    retry_interval = 1m
    max_check_attempts = 3
    // Kept on one line: the expression is not inside parentheses, so it must
    // not be broken after the operator.
    assign where host.name == cl && host.vars.check_location_service_throttled == "true"
    vars.alert_contact = dummyhost.vars.alert_contact
    vars.ec2_instances = []
    vars.threshold = ""
    vars.total = 0

    // Collect the real (non-dummy) member hosts of this cluster at config
    // compile time. The last matching host decides vars.threshold.
    for (h in get_objects(Host)) {
      if (h.vars.check_location_service_throttled == "true" && h.vars.range == cl && h.vars.host_type != "dummy") {
        vars.ec2_instances.add(h.name)
        vars.threshold = if (h.vars.check_location_service_throttled_cluster_threshold) {
          h.vars.check_location_service_throttled_cluster_threshold
        } else {
          90
        }
      }
    }

    vars.total = len(vars.ec2_instances)

    // Evaluated at check time. Returns the state for the dummy check:
    //   3 (UNKNOWN)  - the cluster has no member instances
    //   2 (CRITICAL) - more than 80% of the members are hard CRITICAL
    //   0 (OK)       - otherwise
    // NOTE(review): vars.threshold is collected above but the comparison
    // below uses a fixed 80 - confirm which value is intended.
    vars.cluster_state = {{
      var dummy_state = 3
      var num_crit = 0
      var total = macro("$total$")

      for (instance in macro("$ec2_instances$")) {
        var svc = get_service(instance, "check location service throttled")
        if (svc == null) {
          // No per-instance service object: cannot be counted as critical.
          continue
        }

        var st = svc.state
        var ty = svc.state_type
        var ht = get_host(instance).state

        // Only hard CRITICAL (state_type == 1) on a host that is up counts;
        // soft CRITICAL is still treated as OK.
        if (st == 2 && ty == 1 && ht == 0) {
          num_crit += 1
        }
      }

      if (total == 0) {
        dummy_state = 3
      } else {
        if (num_crit*100/total > 80) {
          dummy_state = 2
        } else {
          dummy_state = 0
        }
      }

      return dummy_state
    }}

    // Evaluated at check time. Returns the plugin output text: per-state
    // counters followed by the lists of critical, unknown and warning
    // instances.
    vars.cluster_text = {{
      var num_ok = 0
      var num_warn = 0
      var num_crit = 0
      var num_unk = 0
      var instances_crit = []
      var instances_unk = []
      var instances_warn = []

      for (instance in macro("$ec2_instances$")) {
        var svc = get_service(instance, "check location service throttled")
        if (svc == null) {
          // Missing per-instance service is reported as UNKNOWN.
          num_unk += 1
          instances_unk.add(instance)
          continue
        }

        var st = svc.state
        var ty = svc.state_type
        var ht = get_host(instance).state

        if (st == 0 && ht == 0) {
          num_ok += 1
        } else if (st == 1 && ht == 0) {
          // Count the warning as well as listing it; otherwise the summary
          // always reports 0 hosts in WARN.
          num_warn += 1
          instances_warn.add(instance)
        } else if (st == 2 && ty == 1 && ht == 0) {
          num_crit += 1
          instances_crit.add(instance)
        } else if (st == 2 && ty == 0 && ht == 0) {
          // Soft CRITICAL is still counted as OK.
          num_ok += 1
        } else {
          // Host not up, or service in UNKNOWN state.
          num_unk += 1
          instances_unk.add(instance)
        }
      }

      var summary = num_ok + " hosts OK, " + num_warn + " hosts WARN, " + num_crit + " host CRIT, " + num_unk + " hosts UNKNOWN."
      var crit_list = " List of critical instances: " + instances_crit.join(" , ")
      var unk_list = " List of unknown instances: " + instances_unk.join(" , ")
      var warn_list = " List of warning instances: " + instances_warn.join(" , ")

      return summary + crit_list + unk_list + warn_list
    }}

    // The "dummy" check command reads these; the macros resolve to the
    // results of the two functions above.
    vars.dummy_state = "$cluster_state$"
    vars.dummy_text = "$cluster_text$"

  } // end object
} // end for


On Wed, Nov 30, 2016 at 10:49 AM, isaac rodriguez
<isaac.rodrig...@focusae.com> wrote:
> Hello all,
>
> I am using Icinga2 to do some service monitoring across a number of nodes,
> and had a question about aggregating notifications. Suppose I had some
> service running on all my machines and for whatever reason, the service goes
> down on all my hosts. Is there anyway to send out one notification as
> opposed to a notification for each host the service went down on?
>
> Thanks,
> Isaac
> _______________________________________________
> icinga-users mailing list
> icinga-users@lists.icinga.org
> https://lists.icinga.org/mailman/listinfo/icinga-users



-- 
---
Michael Martinez
http://www.michael--martinez.com
_______________________________________________
icinga-users mailing list
icinga-users@lists.icinga.org
https://lists.icinga.org/mailman/listinfo/icinga-users

Reply via email to