Skip to content

Commit

Permalink
Merge pull request #122 from oskoperator/feature/auto-alert-rule
Browse files Browse the repository at this point in the history
feat(magic-alerts): Basic recording rules for alerting
  • Loading branch information
Hy3n4 authored Sep 18, 2024
2 parents df6da67 + 8032d6f commit 78983ff
Show file tree
Hide file tree
Showing 5 changed files with 208 additions and 65 deletions.
7 changes: 5 additions & 2 deletions config/samples/openslo_v1_slo.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# TODO: heureka-specific sample, should be replaced in the further-future with a more comprehensive dev environment
# ref.: https://github.com/oskoperator/osko/issues/45
apiVersion: openslo.com/v1
kind: SLO
metadata:
labels:
label.osko.dev/team: "infrastructure"
label.osko.dev/system: "gatekeeper"
label.osko.dev/domain: "security"
annotations:
osko.dev/datasourceRef: "mimir-infra-ds"
osko.dev/magicAlerting: "true"
name: gatekeeper-webhook-response-time
spec:
budgetingMethod: Occurrences
Expand Down
4 changes: 2 additions & 2 deletions docs/labels-and-annotations.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ and what they are used for.

## Labels

### `labels.osko.dev/<key>`
### `label.osko.dev/<key>`

Enables labeling of Prometheus recording and alerting rules, for example for alert routing purposes.

```yaml
labels.osko.dev/team: "infrastructure"
label.osko.dev/team: "infrastructure"
```
## Annotations
Expand Down
38 changes: 27 additions & 11 deletions internal/controller/osko/mimirrule_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,18 +227,34 @@ func (r *MimirRuleReconciler) newMimirClient(connectionDetails *oskov1alpha1.Con

func (r *MimirRuleReconciler) createMimirRuleGroupAPI(log logr.Logger, rule *oskov1alpha1.RuleGroup) error {
var mimirRuleNodes []rulefmt.RuleNode
mimirRuleNode := rulefmt.RuleNode{}
for _, r := range rule.Rules {
mimirRuleNode := rulefmt.RuleNode{
Record: yaml.Node{
Kind: 8,
Value: r.Record,
},
Alert: yaml.Node{},
Expr: yaml.Node{
Kind: 8,
Value: r.Expr,
},
Labels: r.Labels,
if r.Alert == "" {
mimirRuleNode = rulefmt.RuleNode{
Record: yaml.Node{
Kind: 8,
Value: r.Record,
},
Alert: yaml.Node{},
Expr: yaml.Node{
Kind: 8,
Value: r.Expr,
},
Labels: r.Labels,
}
} else {
mimirRuleNode = rulefmt.RuleNode{
Alert: yaml.Node{
Kind: 8,
Value: r.Alert,
},
Expr: yaml.Node{
Kind: 8,
Value: r.Expr,
},
For: r.For,
Labels: r.Labels,
}
}
mimirRuleNodes = append(mimirRuleNodes, mimirRuleNode)
}
Expand Down
26 changes: 22 additions & 4 deletions internal/helpers/mimirtool_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
openslov1 "github.com/oskoperator/osko/api/openslo/v1"
oskov1alpha1 "github.com/oskoperator/osko/api/osko/v1alpha1"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/prometheus/common/model"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"reflect"
)
Expand Down Expand Up @@ -68,11 +69,28 @@ func NewMimirRuleGroups(rule *monitoringv1.PrometheusRule, connectionDetails *os
Name: group.Name,
SourceTenants: connectionDetails.SourceTenants,
}
mimirRuleNode := oskov1alpha1.Rule{}

for _, r := range group.Rules {
mimirRuleNode := oskov1alpha1.Rule{
Record: r.Record,
Expr: r.Expr.String(),
Labels: r.Labels,
if r.Record == "" && r.Alert != "" {

duration, err := model.ParseDuration(string(*r.For))
if err != nil {
return nil, err
}

mimirRuleNode = oskov1alpha1.Rule{
Alert: r.Alert,
Expr: r.Expr.String(),
For: duration,
Labels: r.Labels,
}
} else {
mimirRuleNode = oskov1alpha1.Rule{
Record: r.Record,
Expr: r.Expr.String(),
Labels: r.Labels,
}
}
mimirRules = append(mimirRules, mimirRuleNode)
}
Expand Down
198 changes: 152 additions & 46 deletions internal/helpers/prometheus_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ type RuleTemplateData struct {
Labels string
}

// AlertRuleTemplateData holds data to fill the PromQL template for alerting rules.
type AlertRuleTemplateData struct {
Metric string
Service string
Window string
RecordName string
Labels string
For string
}

type MonitoringRuleSet struct {
Slo *openslov1.SLO
Sli *openslov1.SLI
Expand Down Expand Up @@ -224,6 +234,8 @@ func (mrs *MonitoringRuleSet) createRecordingRule(metric, recordName, window str

// SetupRules constructs rule groups for monitoring based on SLO and SLI configurations.
func (mrs *MonitoringRuleSet) SetupRules() ([]monitoringv1.RuleGroup, error) {
//log := ctrllog.FromContext(context.Background())

baseWindow := mrs.BaseWindow //Should configurable somewhere as agreed on product workshop
extendedWindow := "28d" //Default to 28d if not specified in the SLO

Expand All @@ -235,71 +247,165 @@ func (mrs *MonitoringRuleSet) SetupRules() ([]monitoringv1.RuleGroup, error) {
return []monitoringv1.RuleGroup{}, fmt.Errorf("Unsupported metric source type")
}

targetRuleBase := mrs.createRecordingRule(mrs.Slo.Spec.Objectives[0].Target, "slo_target", baseWindow, false)
targetRuleExtended := mrs.createRecordingRule(mrs.Slo.Spec.Objectives[0].Target, "slo_target", extendedWindow, true)
targetRuleExtendedPageFast := mrs.createRecordingRule(mrs.Slo.Spec.Objectives[0].Target, "slo_target", "1h", true)
var rules = map[string]map[string]monitoringv1.Rule{
"targetRule": {},
"totalRule": {},
"goodRule": {},
"badRule": {},
"sliMeasurement": {},
"errorBudgetValue": {},
"errorBudgetTarget": {},
"burnRate": {},
}

windows := []string{baseWindow, extendedWindow, "5m", "30m", "1h", "2h", "6h", "24h", "3d"}
var alertRuleErrorBudgets []monitoringv1.Rule

totalRuleBase := mrs.createRecordingRule(mrs.Sli.Spec.RatioMetric.Total.MetricSource.Spec.Query, "sli_total", baseWindow, false)
// BASE WINDOW
rules["targetRule"][baseWindow] = mrs.createRecordingRule(mrs.Slo.Spec.Objectives[0].Target, "slo_target", baseWindow, false)
rules["totalRule"][baseWindow] = mrs.createRecordingRule(mrs.Sli.Spec.RatioMetric.Total.MetricSource.Spec.Query, "sli_total", baseWindow, false)

var goodRuleBase monitoringv1.Rule
if mrs.Sli.Spec.RatioMetric.Good.MetricSource.Spec.Query != "" {
goodRuleBase = mrs.createRecordingRule(mrs.Sli.Spec.RatioMetric.Good.MetricSource.Spec.Query, "sli_good", baseWindow, false)
rules["goodRule"][baseWindow] = mrs.createRecordingRule(mrs.Sli.Spec.RatioMetric.Good.MetricSource.Spec.Query, "sli_good", baseWindow, false)
} else {
badRuleBase := mrs.createRecordingRule(mrs.Sli.Spec.RatioMetric.Bad.MetricSource.Spec.Query, "sli_bad", baseWindow, false)
goodRuleBase = mrs.createAntecedentRule(
rules["badRule"][baseWindow] = mrs.createRecordingRule(mrs.Sli.Spec.RatioMetric.Bad.MetricSource.Spec.Query, "sli_bad", baseWindow, false)
rules["goodRule"][baseWindow] = mrs.createAntecedentRule(
fmt.Sprintf("%v - %v",
totalRuleBase.Expr.String(),
badRuleBase.Expr.String(),
rules["totalRule"][baseWindow].Record,
rules["badRule"][baseWindow].Record,
), "sli_good", baseWindow)
}

totalRuleExtended := mrs.createRecordingRule(totalRuleBase.Record, "sli_total", extendedWindow, true)
totalRuleExtendedPageFast := mrs.createRecordingRule(totalRuleBase.Record, "sli_total", "1h", true)
goodRuleExtended := mrs.createRecordingRule(goodRuleBase.Record, "sli_good", extendedWindow, true)
goodRuleExtendedPageFast := mrs.createRecordingRule(goodRuleBase.Record, "sli_good", "1h", true)
rules["sliMeasurement"][baseWindow] = mrs.createSliMeasurementRecordingRule(rules["totalRule"][baseWindow], rules["goodRule"][baseWindow], baseWindow)
rules["errorBudgetValue"][baseWindow] = mrs.createErrorBudgetValueRecordingRule(rules["sliMeasurement"][baseWindow], baseWindow)
rules["errorBudgetTarget"][baseWindow] = mrs.createErrorBudgetTargetRecordingRule(baseWindow)
rules["burnRate"][baseWindow] = mrs.createBurnRateRecordingRule(rules["errorBudgetValue"][baseWindow], rules["errorBudgetTarget"][baseWindow], baseWindow)

sliMeasurementBase := mrs.createSliMeasurementRecordingRule(totalRuleBase, goodRuleBase, baseWindow)
sliMeasurementExtended := mrs.createSliMeasurementRecordingRule(totalRuleExtended, goodRuleExtended, extendedWindow)
sliMeasurementExtendedPageFast := mrs.createSliMeasurementRecordingRule(totalRuleExtendedPageFast, goodRuleExtendedPageFast, "1h")

errorBudgetValueBase := mrs.createErrorBudgetValueRecordingRule(sliMeasurementBase, baseWindow)
errorBudgetValueExtended := mrs.createErrorBudgetValueRecordingRule(sliMeasurementExtended, extendedWindow)
errorBudgetValueExtendedPageFast := mrs.createErrorBudgetValueRecordingRule(sliMeasurementExtendedPageFast, "1h")

errorBudgetTargetBase := mrs.createErrorBudgetTargetRecordingRule(baseWindow)
errorBudgetTargetExtended := mrs.createErrorBudgetTargetRecordingRule(extendedWindow)
errorBudgetTargetExtendedPageFast := mrs.createErrorBudgetTargetRecordingRule("1h")
for _, window := range windows {
if window == baseWindow {
continue
}
// rules["targetRule"][window] = mrs.createRecordingRule(mrs.Slo.Spec.Objectives[0].Target, "slo_target", window, true)
rules["totalRule"][window] = mrs.createRecordingRule(rules["totalRule"][baseWindow].Record, "sli_total", window, true)

if mrs.Sli.Spec.RatioMetric.Good.MetricSource.Spec.Query != "" {
rules["goodRule"][window] = mrs.createRecordingRule(rules["goodRule"][baseWindow].Record, "sli_good", window, true)
} else {
rules["badRule"][window] = mrs.createRecordingRule(rules["badRule"][baseWindow].Record, "sli_bad", window, true)
rules["goodRule"][window] = mrs.createAntecedentRule(
fmt.Sprintf("%v - %v",
rules["totalRule"][window].Record,
rules["badRule"][window].Record,
), "sli_good", window)
}

burnRateBase := mrs.createBurnRateRecordingRule(errorBudgetValueBase, errorBudgetTargetBase, baseWindow)
burnRateExtended := mrs.createBurnRateRecordingRule(errorBudgetValueExtended, errorBudgetTargetExtended, extendedWindow)
burnRateExtendedPageFast := mrs.createBurnRateRecordingRule(errorBudgetValueExtendedPageFast, errorBudgetTargetExtendedPageFast, "1h")
rules["sliMeasurement"][window] = mrs.createSliMeasurementRecordingRule(rules["totalRule"][window], rules["goodRule"][window], window)
rules["errorBudgetValue"][window] = mrs.createErrorBudgetValueRecordingRule(rules["sliMeasurement"][window], window)
rules["errorBudgetTarget"][window] = mrs.createErrorBudgetTargetRecordingRule(window)
rules["burnRate"][window] = mrs.createBurnRateRecordingRule(rules["errorBudgetValue"][window], rules["errorBudgetTarget"][window], window)
if window == "5m" || window == "30m" || window == "1h" || window == "6h" || window == "24h" || window == "3d" || window == "2h" {
alertRuleErrorBudgets = append(alertRuleErrorBudgets, rules["burnRate"][window])
}
}

targetRules := []monitoringv1.Rule{targetRuleBase, targetRuleExtended, targetRuleExtendedPageFast}
goodRules := []monitoringv1.Rule{goodRuleBase, goodRuleExtended, goodRuleExtendedPageFast}
totalRules := []monitoringv1.Rule{totalRuleBase, totalRuleExtended, totalRuleExtendedPageFast}
sliMeasurementRules := []monitoringv1.Rule{sliMeasurementBase, sliMeasurementExtended, sliMeasurementExtendedPageFast}
errorBudgetRules := []monitoringv1.Rule{
errorBudgetValueBase,
errorBudgetValueExtended,
errorBudgetValueExtendedPageFast,
errorBudgetTargetBase,
errorBudgetTargetExtended,
errorBudgetTargetExtendedPageFast,
rulesByType := make(map[string][]monitoringv1.Rule)
for ruleKey, nestedMap := range rules {
for _, window := range windows {
if rule, exists := nestedMap[window]; exists {
rulesByType[ruleKey] = append(rulesByType[ruleKey], rule)
}
}
}
burnRateRules := []monitoringv1.Rule{burnRateBase, burnRateExtended, burnRateExtendedPageFast}

sloName := mrs.Slo.Name
ruleGroups := []monitoringv1.RuleGroup{
{Name: fmt.Sprintf("%s_slo_target", sloName), Rules: targetRules},
{Name: fmt.Sprintf("%s_sli_good", sloName), Rules: goodRules},
{Name: fmt.Sprintf("%s_sli_total", sloName), Rules: totalRules},
{Name: fmt.Sprintf("%s_sli_measurement", sloName), Rules: sliMeasurementRules},
{Name: fmt.Sprintf("%s_error_budget", sloName), Rules: errorBudgetRules},
{Name: fmt.Sprintf("%s_burn_rate", sloName), Rules: burnRateRules},
{Name: fmt.Sprintf("%s_slo_target", sloName), Rules: rulesByType["targetRule"]},
{Name: fmt.Sprintf("%s_sli_good", sloName), Rules: rulesByType["goodRule"]},
{Name: fmt.Sprintf("%s_sli_total", sloName), Rules: rulesByType["totalRule"]},
{Name: fmt.Sprintf("%s_sli_measurement", sloName), Rules: rulesByType["sliMeasurement"]},
{Name: fmt.Sprintf("%s_error_budget", sloName), Rules: rulesByType["errorBudgetValue"]},
{Name: fmt.Sprintf("%s_burn_rate", sloName), Rules: rulesByType["burnRate"]},
}

if mrs.Slo.ObjectMeta.Annotations["osko.dev/magicAlerting"] == "true" {
duration := monitoringv1.Duration("5m")
var alertRules []monitoringv1.Rule
alertRules = append(alertRules, mrs.createMagicMultiBurnRateAlert(alertRuleErrorBudgets, "0.001", &duration, "page"))
alertRules = append(alertRules, mrs.createMagicMultiBurnRateAlert(alertRuleErrorBudgets, "0.001", &duration, "ticket"))
ruleGroups = append(ruleGroups, monitoringv1.RuleGroup{
Name: fmt.Sprintf("%s_slo_alert", sloName), Rules: alertRules,
})
}
return ruleGroups, nil
}

// createMagicMultiBurnRateAlert creates a Prometheus alert rule for multi-burn rate alerting.
func (mrs *MonitoringRuleSet) createMagicMultiBurnRateAlert(burnRates []monitoringv1.Rule, threshold string, duration *monitoringv1.Duration, severity string) monitoringv1.Rule {
log := ctrllog.FromContext(context.Background())
cfg := config.NewConfig()

alertingPageWindowsOrder := []string{"1h", "5m", "6h", "30m", "24h", "2h", "3d"}

alertingPageWindows := map[string]monitoringv1.Rule{
"1h": {},
"5m": {},
"6h": {},
"30m": {},
"24h": {},
"2h": {},
"3d": {},
}

// Populate the alertingPageWindows map with the actual burn rates
for _, br := range burnRates {
if _, exists := alertingPageWindows[br.Labels["window"]]; exists {
alertingPageWindows[br.Labels["window"]] = br
}
}

// Define the alert expressions for different severities and durations
var alertExpression string
if severity == "page" {
alertExpression = fmt.Sprintf(
"(%s{%s} > (%.1f * %s) and %s{%s} > (%.1f * %s)) or (%s{%s} > (%.1f * %s) and %s{%s} > (%.1f * %s))",
alertingPageWindows[alertingPageWindowsOrder[2]].Record, mapToColonSeparatedString(burnRates[2].Labels), cfg.AlertingBurnRates.PageShortWindow, threshold,
alertingPageWindows[alertingPageWindowsOrder[0]].Record, mapToColonSeparatedString(burnRates[0].Labels), cfg.AlertingBurnRates.PageShortWindow, threshold,
alertingPageWindows[alertingPageWindowsOrder[3]].Record, mapToColonSeparatedString(burnRates[3].Labels), cfg.AlertingBurnRates.PageLongWindow, threshold,
alertingPageWindows[alertingPageWindowsOrder[1]].Record, mapToColonSeparatedString(burnRates[1].Labels), cfg.AlertingBurnRates.PageLongWindow, threshold,
)
} else if severity == "ticket" {
alertExpression = fmt.Sprintf(
"(%s{%s} > (%.1f * %s) and %s{%s} > (%.1f * %s)) or (%s{%s} > %.3f and %s{%s} > %.3f)",
alertingPageWindows[alertingPageWindowsOrder[4]].Record, mapToColonSeparatedString(burnRates[4].Labels), cfg.AlertingBurnRates.TicketShortWindow, threshold,
alertingPageWindows[alertingPageWindowsOrder[5]].Record, mapToColonSeparatedString(burnRates[5].Labels), cfg.AlertingBurnRates.TicketShortWindow, threshold,
alertingPageWindows[alertingPageWindowsOrder[6]].Record, mapToColonSeparatedString(burnRates[6].Labels), cfg.AlertingBurnRates.TicketLongWindow,
alertingPageWindows[alertingPageWindowsOrder[3]].Record, mapToColonSeparatedString(burnRates[3].Labels), cfg.AlertingBurnRates.TicketLongWindow,
)
}

if alertExpression == "" {
log.Info("Creation of alerting rule failed", "EmptyExpression", "Failed to create the alert expression, expression is empty")
return monitoringv1.Rule{}
}

return monitoringv1.Rule{
Alert: fmt.Sprintf("%s_alert_%s", burnRates[0].Record, severity),
Expr: intstr.FromString(alertExpression),
For: duration,
Labels: map[string]string{
"severity": severity,
},
Annotations: map[string]string{
"summary": "SLO Burn Rate Alert",
"description": fmt.Sprintf("The burn rate of the SLO %s is higher than the %s threshold", mrs.Slo.Name, threshold),
},
}
}

func CreateAlertingRule() (*monitoringv1.PrometheusRule, error) {
return nil, nil
}

func CreatePrometheusRule(slo *openslov1.SLO, sli *openslov1.SLI) (*monitoringv1.PrometheusRule, error) {
cfg := config.NewConfig()
baseWindow := cfg.DefaultBaseWindow.String()
Expand Down

0 comments on commit 78983ff

Please sign in to comment.