diff --git a/README.md b/README.md index 8291695..9a39779 100644 --- a/README.md +++ b/README.md @@ -153,16 +153,20 @@ Examples: | total=2 firing=1 pending=0 inactive=1 Flags: - --exclude-alert stringArray Alerts to ignore. Can be used multiple times and supports regex. - -h, --help help for alert - -n, --name strings The name of one or more specific alerts to check. - This parameter can be repeated e.G.: '--name alert1 --name alert2' - If no name is given, all alerts will be evaluated - -g, --group strings The name of one or more specific groups to check. - This parameter can be repeated e.G.: '--group group1 --group group2' - If no group is given, all groups will be scanned for alerts - -T, --no-alerts-state string State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK (default "OK") - -P, --problems Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed + --exclude-alert stringArray Alerts to ignore. Can be used multiple times and supports regex. + --exclude-label stringArray The label of one or more specific alerts to exclude. + This parameter can be repeated e.g.: '--exclude-label prio=high --exclude-label another=example' + -g, --group strings The name of one or more specific groups to check for alerts. + This parameter can be repeated e.g.: '--group group1 --group group2' + If no group is given, all groups will be scanned for alerts + -h, --help help for alert + --include-label stringArray The label of one or more specific alerts to include. + This parameter can be repeated e.g.: '--include-label prio=high --include-label another=example' + -n, --name strings The name of one or more specific alerts to check. 
+ This parameter can be repeated e.g.: '--name alert1 --name alert2' + If no name is given, all alerts will be evaluated + -T, --no-alerts-state string State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK (default "OK") + -P, --problems Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed ``` #### Checking all defined alerts diff --git a/cmd/alert.go b/cmd/alert.go index 4e2bbd3..5bc0239 100644 --- a/cmd/alert.go +++ b/cmd/alert.go @@ -4,12 +4,14 @@ import ( "errors" "fmt" "regexp" + "slices" "strings" "github.com/NETWAYS/check_prometheus/internal/alert" "github.com/NETWAYS/go-check" "github.com/NETWAYS/go-check/perfdata" "github.com/NETWAYS/go-check/result" + "github.com/prometheus/common/model" "github.com/spf13/cobra" ) @@ -17,23 +19,14 @@ type AlertConfig struct { AlertName []string Group []string ExcludeAlerts []string + ExcludeLabels []string + IncludeLabels []string ProblemsOnly bool NoAlertsState string } var cliAlertConfig AlertConfig -func contains(s string, list []string) bool { - // Tiny helper to see if a string is in a list of strings - for _, elem := range list { - if s == elem { - return true - } - } - - return false -} - var alertCmd = &cobra.Command{ Use: "alert", Short: "Checks the status of a Prometheus alert", @@ -111,31 +104,47 @@ inactive = 0`, var overall result.Overall - for _, rl := range rules { + includeLabels := sliceToMap(cliAlertConfig.IncludeLabels) + excludeLabels := sliceToMap(cliAlertConfig.ExcludeLabels) + for _, rl := range rules { // If it's not the Alert we're looking for, Skip! 
if cliAlertConfig.AlertName != nil { - if !contains(rl.AlertingRule.Name, cliAlertConfig.AlertName) { + if !slices.Contains(cliAlertConfig.AlertName, rl.AlertingRule.Name) { continue } } + labelsMatchedInclude := matchesLabel(rl.AlertingRule.Labels, includeLabels) + + if len(cliAlertConfig.IncludeLabels) > 0 && !labelsMatchedInclude { + // If the alert labels don't match here we can skip it. + continue + } + // Skip inactive alerts if flag is set if len(rl.AlertingRule.Alerts) == 0 && cliAlertConfig.ProblemsOnly { continue } - alertMatched, regexErr := matches(rl.AlertingRule.Name, cliAlertConfig.ExcludeAlerts) + alertMatchedExclude, regexErr := matches(rl.AlertingRule.Name, cliAlertConfig.ExcludeAlerts) if regexErr != nil { check.ExitRaw(check.Unknown, "Invalid regular expression provided:", regexErr.Error()) } - if alertMatched { + if alertMatchedExclude { // If the alert matches a regex from the list we can skip it. continue } + labelsMatchedExclude := matchesLabel(rl.AlertingRule.Labels, excludeLabels) + + if len(cliAlertConfig.ExcludeLabels) > 0 && labelsMatchedExclude { + // If the alert labels matches here we can skip it. + continue + } + // Handle Inactive Alerts if len(rl.AlertingRule.Alerts) == 0 { // Counting states for perfdata @@ -218,18 +227,27 @@ func init() { fs.StringVarP(&cliAlertConfig.NoAlertsState, "no-alerts-state", "T", "OK", "State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK") - fs.StringArrayVar(&cliAlertConfig.ExcludeAlerts, "exclude-alert", []string{}, "Alerts to ignore. Can be used multiple times and supports regex.") + fs.StringArrayVar(&cliAlertConfig.ExcludeAlerts, "exclude-alert", []string{}, + "Alerts to ignore. 
Can be used multiple times and supports regex.") fs.StringSliceVarP(&cliAlertConfig.AlertName, "name", "n", nil, "The name of one or more specific alerts to check."+ - "\nThis parameter can be repeated e.G.: '--name alert1 --name alert2'"+ + "\nThis parameter can be repeated e.g.: '--name alert1 --name alert2'"+ "\nIf no name is given, all alerts will be evaluated") fs.StringSliceVarP(&cliAlertConfig.Group, "group", "g", nil, "The name of one or more specific groups to check for alerts."+ - "\nThis parameter can be repeated e.G.: '--group group1 --group group2'"+ + "\nThis parameter can be repeated e.g.: '--group group1 --group group2'"+ "\nIf no group is given, all groups will be scanned for alerts") + fs.StringArrayVar(&cliAlertConfig.IncludeLabels, "include-label", []string{}, + "The label of one or more specific alerts to include."+ + "\nThis parameter can be repeated e.g.: '--include-label prio=high --include-label another=example'") + + fs.StringArrayVar(&cliAlertConfig.ExcludeLabels, "exclude-label", []string{}, + "The label of one or more specific alerts to exclude."+ + "\nThis parameter can be repeated e.g.: '--exclude-label prio=high --exclude-label another=example'") + fs.BoolVarP(&cliAlertConfig.ProblemsOnly, "problems", "P", false, "Display only alerts which status is not inactive/OK. 
Note that in combination with the --name flag this might result in no alerts being displayed") } @@ -267,3 +285,33 @@ func matches(input string, regexToExclude []string) (bool, error) { return false, nil } + +// matchesLabel reports whether at least one label in labels is also present +// in labelsToMatch under the same name and with an identical value. +func matchesLabel(labels model.LabelSet, labelsToMatch map[string]string) bool { + for k, v := range labels { + if dv, ok := labelsToMatch[string(k)]; ok && dv == string(v) { + return true + } + } + + return false +} + +// sliceToMap turns "name=value" arguments into a map keyed by label name. +// Only the first "=" separates name and value; entries without "=" are skipped. +// A name that is given more than once keeps only the value of its last entry. +func sliceToMap(labels []string) map[string]string { + m := make(map[string]string, len(labels)) + + for _, s := range labels { + name, value, found := strings.Cut(s, "=") + if !found { + continue + } + + m[name] = value + } + + return m +} diff --git a/cmd/alert_test.go b/cmd/alert_test.go index cb40d5e..ee8a448 100644 --- a/cmd/alert_test.go +++ b/cmd/alert_test.go @@ -234,6 +234,60 @@ exit status 2 args: []string{"run", "../main.go", "alert", "--name", "InactiveAlert"}, expected: "[OK] - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive\n\\_ [OK] [InactiveAlert] is inactive\n|total=1 firing=0 pending=0 inactive=1\n\n", }, + { + name: "alert-include-label", + server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write(loadTestdata(alertTestDataSet1)) + })), + args: []string{"run", "../main.go", "alert", "--include-label", "severity=critical"}, + expected: `[CRITICAL] - 2 Alerts: 1 Firing - 0 Pending - 1 Inactive +\_ [OK] [HostOutOfMemory] is inactive +\_ [CRITICAL] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"} +|total=2 firing=1 pending=0 inactive=1 + +exit status 2 +`, + }, + { + name: "alert-exclude-label", + server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write(loadTestdata(alertTestDataSet1)) + })), 
+ args: []string{"run", "../main.go", "alert", "--exclude-label", "severity=critical"}, + expected: `[WARNING] - 1 Alerts: 0 Firing - 1 Pending - 0 Inactive +\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"} +|total=1 firing=0 pending=1 inactive=0 + +exit status 1 +`, + }, + { + name: "alert-include-label-multiple", + server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write(loadTestdata(alertTestDataSet1)) + })), + args: []string{"run", "../main.go", "alert", "--include-label", "team=database", "--include-label", "severity=critical"}, + expected: `[CRITICAL] - 3 Alerts: 1 Firing - 1 Pending - 1 Inactive +\_ [OK] [HostOutOfMemory] is inactive +\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"} +\_ [CRITICAL] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"} +|total=3 firing=1 pending=1 inactive=1 + +exit status 2 +`, + }, + { + name: "alert-exclude-label-multiple", + server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write(loadTestdata(alertTestDataSet1)) + })), + args: []string{"run", "../main.go", "alert", "--exclude-label", "team=database", "--exclude-label", "severity=critical"}, + expected: "[OK] - 0 Alerts: 0 Firing - 0 Pending - 0 Inactive\n\\_ [OK] No alerts retrieved\n|total=0 firing=0 pending=0 inactive=0\n\n", + }, } for _, test := range tests { diff --git a/testdata/unittest/alertDataset1.json b/testdata/unittest/alertDataset1.json index 7e46bff..ef5ebc5 100644 --- 
a/testdata/unittest/alertDataset1.json +++ b/testdata/unittest/alertDataset1.json @@ -12,7 +12,8 @@ "query": "up", "duration": 120, "labels": { - "severity": "critical" + "severity": "critical", + "team": "network" }, "annotations": { "description": "Foo", @@ -40,7 +41,8 @@ "query": "mysql", "duration": 17280000, "labels": { - "severity": "warning" + "severity": "warning", + "team": "database" }, "annotations": { "description": "MySQL", @@ -84,7 +86,8 @@ "query": "SSL", "duration": 0, "labels": { - "severity": "critical" + "severity": "critical", + "team": "network" }, "annotations": { "description": "TLS",