Commit 70c1a5e0 authored by Dmitry Volodin's avatar Dmitry Volodin Committed by Andrey Vertiprahov
Browse files

Alarm Groups: min threshold

parent af2c47ff
......@@ -14,8 +14,8 @@
"description": "Alarm Group name"
}
],
"subject_template": "Group Alarm {{alarm.vars[\"name\"]}} - {{alarm.vars[\"title\"]}}",
"body_template": "Alarm by group {{alarm.vars[\"name\"]}} ",
"subject_template": "Group Alarm {{alarm.vars.name}}",
"body_template": "Alarm by group {{alarm.vars.name}}",
"symptoms": null,
"probable_causes": null,
"recommended_actions": "Ignore this"
......
......@@ -75,6 +75,7 @@ class ActiveAlarm(Document):
"labels",
"effective_labels",
"groups",
"deferred_groups",
],
}
status = "A"
......@@ -109,6 +110,8 @@ class ActiveAlarm(Document):
root = ObjectIdField(required=False)
# Group alarm references
groups = ListField(BinaryField())
# Groups waiting to reach the min_threshold quorum
deferred_groups = ListField(BinaryField())
# Escalated TT ID in form
# <external system name>:<external tt id>
escalation_ts = DateTimeField(required=False)
......
......@@ -18,6 +18,7 @@ from mongoengine.fields import (
BooleanField,
ListField,
LongField,
IntField,
ReferenceField,
EmbeddedDocumentField,
)
......@@ -53,6 +54,10 @@ class Group(EmbeddedDocument):
alarm_class = PlainReferenceField(AlarmClass)
# Group Title template
title_template = StringField(default="")
# Minimum amount of alarms to create the group
min_threshold = IntField(default=0)
# Correlation window in seconds to check min_threshold
window = IntField(default=0)
def __str__(self):
return f'{self.alarm_class or ""}/{self.title_template or ""}: {self.reference_template}'
......
......@@ -35,6 +35,8 @@ class Group(object):
alarm_class: AlarmClass
title_template: Template
labels: Optional[List[str]] = None
min_threshold: int = 0
window: int = 0
@dataclass
......@@ -43,6 +45,8 @@ class GroupItem(object):
alarm_class: AlarmClass
title: str
labels: Optional[List[str]] = None
min_threshold: int = 0
window: int = 0
class AlarmRule(object):
......@@ -76,6 +80,8 @@ class AlarmRule(object):
if group.alarm_class
else cls.get_default_alarm_class(),
title_template=Template(group.title_template),
min_threshold=group.min_threshold or 0,
window=group.window or 0,
)
)
return rule
......@@ -122,6 +128,8 @@ class AlarmRule(object):
alarm_class=group.alarm_class,
title=group.title_template.render(**ctx),
labels=group.labels,
min_threshold=group.min_threshold,
window=group.window,
)
......
......@@ -12,7 +12,7 @@ import datetime
import re
from collections import defaultdict
import threading
from typing import Union, Any, Iterable, Optional, Dict, List, Set
from typing import Union, Any, Iterable, Optional, Tuple, Dict, List, Set
import operator
from itertools import chain
from hashlib import sha512
......@@ -476,8 +476,9 @@ class CorrelatorService(TornadoService):
for gi in rule.iter_groups(a):
if gi.reference and gi.reference not in alarm_groups:
alarm_groups[gi.reference] = gi
all_groups = await self.get_groups(a, alarm_groups.values())
all_groups, deferred_groups = await self.get_groups(a, alarm_groups.values())
a.groups = [g.reference for g in all_groups]
a.deferred_groups = deferred_groups
# Save
a.save()
if event:
......@@ -1079,18 +1080,79 @@ class CorrelatorService(TornadoService):
correlate_uplinks(a)
self.logger.debug("[%s] Correlation completed", alarm.id)
def get_group_deferred_count(
    self, h_ref: bytes, min_ts: datetime.datetime, max_ts: datetime.datetime
) -> int:
    """
    Count the active alarms still waiting on a deferred group reference.

    :param h_ref: Group reference value, as stored in `deferred_groups`
    :param min_ts: Lower bound of the alarm timestamp (inclusive)
    :param max_ts: Upper bound of the alarm timestamp (inclusive)
    :returns: Number of matching alarms, 0 when nothing matches
    """
    pipeline = [
        # Alarms deferred on this reference within the time window
        {
            "$match": {
                "deferred_groups": h_ref,
                "timestamp": {"$gte": min_ts, "$lte": max_ts},
            }
        },
        # Fold every match into a single counter document
        {"$group": {"_id": None, "def_count": {"$sum": 1}}},
        {"$project": {"_id": 0}},
    ]
    cursor = ActiveAlarm._get_collection().aggregate(pipeline)
    summary = next(iter(cursor), None)
    if summary is None:
        # The aggregation returned no documents
        return 0
    return summary.get("def_count", 0) or 0
def resolve_deferred_groups(self, h_ref: bytes) -> None:
    """
    Promote a deferred group reference to a permanent one.

    Every active alarm holding `h_ref` in its `deferred_groups` list gets
    the reference appended to `groups` and removed from `deferred_groups`.
    Cached alarms that still list the reference as deferred are evicted
    from the reference cache afterwards.

    :param h_ref: Group reference value, as stored in `deferred_groups`
    """
    ActiveAlarm._get_collection().update_many(
        # Filter on the `deferred_groups` list field (plural): a singular
        # key matches no documents and the promotion silently does nothing
        {"deferred_groups": h_ref},
        {"$push": {"groups": h_ref}, "$pullAll": {"deferred_groups": [h_ref]}},
    )
    # Reset affected cached values
    with ref_lock:
        # Collect the keys first: entries cannot be deleted from the dict
        # while it is being iterated
        deprecated: List[str] = [
            a_ref
            for a_ref, alarm in self._reference_cache.items()
            if alarm.deferred_groups and h_ref in alarm.deferred_groups
        ]
        for a_ref in deprecated:
            del self._reference_cache[a_ref]
async def get_groups(
self, alarm: ActiveAlarm, groups: Iterable[GroupItem]
) -> List[ActiveAlarm]:
) -> Tuple[List[ActiveAlarm], List[bytes]]:
"""
Resolve all groups and create when necessary
:param alarm: Active Alarm to match groups
:param groups: Iterable of group configurations
:returns: Tuple of list of active group alarms
and the list of the deferred group references
"""
r: List[ActiveAlarm] = []
active: List[ActiveAlarm] = []
deferred: List[bytes] = []
for group in groups:
if group.reference == alarm.raw_reference:
continue # Reference cycle
def_h_ref: Optional[bytes] = None
# Fetch or raise group alarm
g_alarm = self.get_by_reference(group.reference)
if not g_alarm:
if group.min_threshold > 0 and group.window > 0:
# Check group has enough deferred alarms to raise thresholds
h_ref = self.get_reference_hash(group.reference)
w_delta = datetime.timedelta(seconds=group.window)
n_waiting = self.get_group_deferred_count(
h_ref, alarm.timestamp - w_delta, alarm.timestamp + w_delta
)
if n_waiting < group.min_threshold - 1:
# Below the threshold, set group as deferred
deferred.append(h_ref)
continue
else:
# Pull deferred alarms later
def_h_ref = h_ref
# Raise group alarm
g_alarm = await self.raise_alarm(
managed_object=alarm.managed_object,
......@@ -1100,12 +1162,14 @@ class CorrelatorService(TornadoService):
reference=group.reference,
labels=group.labels,
)
if g_alarm:
if not g_alarm:
# Update cache
self._reference_cache[group.reference] = g_alarm
if g_alarm:
r.append(g_alarm)
return r
active.append(g_alarm)
if def_h_ref:
self.resolve_deferred_groups(def_h_ref)
return active, deferred
async def clear_groups(self, groups: List[bytes], ts: Optional[datetime.datetime]) -> None:
"""
......
......@@ -62,6 +62,24 @@ Ext.define("NOC.fm.alarmrule.Application", {
xtype: "gridfield",
fieldLabel: __("Group Alarm"),
columns: [
{
text: __("Minimum alarms"),
dataIndex: "min_threshold",
editor: {
xtype: "numberfield"
},
minValue: 0,
defaultValue: 0,
},
{
text: __("Window (sec.)"),
dataIndex: "window",
editor: {
xtype: "numberfield"
},
minValue: 0,
defaultValue: 0,
},
{
text: __("Reference Template"),
dataIndex: "reference_template",
......@@ -81,7 +99,7 @@ Ext.define("NOC.fm.alarmrule.Application", {
editor: "textfield",
allowBlank: true,
flex: 1
}
},
]
},
{
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment