From f7987329c0ae0d20930bdb5c0ebe45309a9b62f0 Mon Sep 17 00:00:00 2001
From: Lukasz Mierzwa
Date: Mon, 11 Mar 2024 18:11:11 +0000
Subject: [PATCH] Add watch rule_files subcommand

---
 .../tests/0167_rule_duplicate_symlink.txt |  1 -
 cmd/pint/tests/0168_watch_rule_files.txt  | 43 +++++++++
 cmd/pint/watch.go                         | 93 ++++++++++++++-----
 docs/changelog.md                         |  5 +
 docs/index.md                             | 51 +++++++++-
 internal/checks/alerts_external_labels.go |  2 +-
 internal/checks/labels_conflict.go        |  2 +-
 internal/checks/promql_rate.go            |  2 +-
 internal/config/prometheus.go             |  9 ++
 internal/promapi/config.go                | 14 ++-
 internal/promapi/config_test.go           |  4 +-
 internal/promapi/failover.go              |  4 +-
 12 files changed, 193 insertions(+), 37 deletions(-)
 create mode 100644 cmd/pint/tests/0168_watch_rule_files.txt

diff --git a/cmd/pint/tests/0167_rule_duplicate_symlink.txt b/cmd/pint/tests/0167_rule_duplicate_symlink.txt
index 0b79a884..c992a505 100644
--- a/cmd/pint/tests/0167_rule_duplicate_symlink.txt
+++ b/cmd/pint/tests/0167_rule_duplicate_symlink.txt
@@ -33,7 +33,6 @@ level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up ta
 -- stderrV2.txt --
 -- src/v0.yml --
 groups:
-groups:
 - name: g1
   rules:
   - alert: DownAlert
diff --git a/cmd/pint/tests/0168_watch_rule_files.txt b/cmd/pint/tests/0168_watch_rule_files.txt
new file mode 100644
index 00000000..e5ff2bde
--- /dev/null
+++ b/cmd/pint/tests/0168_watch_rule_files.txt
@@ -0,0 +1,43 @@
+http response prometheus /api/v1/status/config 200 {"status":"success","data":{"yaml":"rule_files:\n - rules/*\n"}}
+http response prometheus /api/v1/metadata 200 {"status":"success","data":{}}
+http response prometheus /api/v1/status/flags 200 {"status":"success","data":{"storage.tsdb.retention.time": "1d"}}
+http response prometheus /api/v1/query_range 200 {"status":"success","data":{"resultType":"matrix","result":[]}}
+http response prometheus /api/v1/query 200 {"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1666873962.795,"1"]}]}}
+http start prometheus 127.0.0.1:7168
+
+exec bash -x ./test.sh &
+
+pint.ok --no-color -l debug watch --interval=5s --listen=127.0.0.1:6168 --pidfile=pint.pid rule_files prom
+! stdout .
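+# The first scan should find 2 rule files; after test.sh moves more/3.yaml
+# into rules/ the next scan should find 3.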
+
+stderr 'level=DEBUG msg="Glob finder completed" count=2'
+stderr 'level=DEBUG msg="Glob finder completed" count=3'
+
+-- test.sh --
+sleep 7
+mv more/*.yaml rules/
+sleep 7
+cat pint.pid | xargs kill
+
+-- rules/1.yaml --
+groups:
+- name: g1
+  rules:
+    - alert: DownAlert1
+      expr: up == 0
+-- rules/2.yaml --
+groups:
+- name: g2
+  rules:
+    - alert: DownAlert2
+      expr: up == 0
+-- more/3.yaml --
+groups:
+- name: g2
+  rules:
+    - alert: DownAlert2
+      expr: up == 0
+-- .pint.hcl --
+prometheus "prom" {
+  uri = "http://localhost:7168"
+}
\ No newline at end of file
diff --git a/cmd/pint/watch.go b/cmd/pint/watch.go
index 414ea0e1..873f7184 100644
--- a/cmd/pint/watch.go
+++ b/cmd/pint/watch.go
@@ -42,9 +42,59 @@ var watchCmd = &cli.Command{
 	Usage: "Run in the foreground and continuously check specified rules.",
 	Subcommands: []*cli.Command{
 		{
-			Name:   "glob",
-			Usage:  "Check a list of files or directories (can be a glob).",
-			Action: actionWatch,
+			Name:  "glob",
+			Usage: "Check a list of files or directories (can be a glob).",
+			Action: func(c *cli.Context) error {
+				meta, err := actionSetup(c)
+				if err != nil {
+					return err
+				}
+
+				paths := c.Args().Slice()
+				if len(paths) == 0 {
+					return fmt.Errorf("at least one file or directory required")
+				}
+
+				slog.Debug("Starting glob watch", slog.Any("paths", paths))
+				return actionWatch(c, meta, func(_ context.Context) ([]string, error) {
+					return paths, nil
+				})
+			},
+		},
+		{
+			Name:  "rule_files",
+			Usage: "Check the list of rule files from paths loaded by Prometheus.",
+			Action: func(c *cli.Context) error {
+				meta, err := actionSetup(c)
+				if err != nil {
+					return err
+				}
+
+				args := c.Args().Slice()
+				if len(args) != 1 {
+					return fmt.Errorf("exactly one argument required with the name of the Prometheus server to query")
+				}
+
+				gen := config.NewPrometheusGenerator(meta.cfg, prometheus.NewRegistry())
+				if err = gen.GenerateStatic(); err != nil {
+					return err
+				}
+
+				prom := gen.ServerWithName(args[0])
+				if prom == nil {
+					return fmt.Errorf("no Prometheus named %q configured in pint", args[0])
+				}
+
+				slog.Debug("Starting rule_files watch", slog.String("name", args[0]))
+
+				return actionWatch(c, meta, func(ctx context.Context) ([]string, error) {
+					cfg, err := prom.Config(ctx, time.Millisecond)
+					if err != nil {
+						return nil, fmt.Errorf("failed to query %q Prometheus configuration: %w", prom.Name(), err)
+					}
+					return cfg.Config.RuleFiles, nil
+				})
+			},
 		},
 	},
 	Flags: []cli.Flag{
@@ -80,17 +130,7 @@ var watchCmd = &cli.Command{
 	},
 }
 
-func actionWatch(c *cli.Context) error {
-	meta, err := actionSetup(c)
-	if err != nil {
-		return err
-	}
-
-	paths := c.Args().Slice()
-	if len(paths) == 0 {
-		return fmt.Errorf("at least one file or directory required")
-	}
-
+func actionWatch(c *cli.Context, meta actionMeta, f pathFinderFunc) error {
 	minSeverity, err := checks.ParseSeverity(c.String(minSeverityFlag))
 	if err != nil {
 		return fmt.Errorf("invalid --%s value: %w", minSeverityFlag, err)
@@ -114,7 +154,7 @@ func actionWatch(c *cli.Context) error {
 	}
 
 	// start HTTP server for metrics
-	collector := newProblemCollector(meta.cfg, paths, minSeverity, c.Int(maxProblemsFlag))
+	collector := newProblemCollector(meta.cfg, f, minSeverity, c.Int(maxProblemsFlag))
 	// register all metrics
 	metricsRegistry.MustRegister(collector)
 	metricsRegistry.MustRegister(checkDuration)
@@ -221,22 +261,22 @@ func startTimer(ctx context.Context, workers int, isOffline bool, gen *config.Pr
 }
 
 type problemCollector struct {
+	finder           pathFinderFunc
 	cfg              config.Config
 	fileOwners       map[string]string
 	summary          *reporter.Summary
 	problem          *prometheus.Desc
 	problems         *prometheus.Desc
 	fileOwnersMetric *prometheus.Desc
-	paths            []string
 	minSeverity      checks.Severity
 	maxProblems      int
 	lock             sync.Mutex
 }
 
-func newProblemCollector(cfg config.Config, paths []string, minSeverity checks.Severity, maxProblems int) *problemCollector {
+func newProblemCollector(cfg config.Config, f pathFinderFunc, minSeverity checks.Severity, maxProblems int) *problemCollector {
 	return &problemCollector{
+		finder:     f,
 		cfg:        cfg,
-		paths:      paths,
 		fileOwners: map[string]string{},
 		problem: prometheus.NewDesc(
 			"pint_problem",
@@ -262,8 +302,13 @@ func newProblemCollector(cfg config.Config, paths []string, minSeverity checks.S
 }
 
 func (c *problemCollector) scan(ctx context.Context, workers int, isOffline bool, gen *config.PrometheusGenerator) error {
-	slog.Info("Finding all rules to check", slog.Any("paths", c.paths))
-	entries, err := discovery.NewGlobFinder(c.paths, git.NewPathFilter(nil, nil, c.cfg.Parser.CompileRelaxed())).Find()
+	paths, err := c.finder(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to get the list of paths to check: %w", err)
+	}
+
+	slog.Info("Finding all rules to check", slog.Any("paths", paths))
+	entries, err := discovery.NewGlobFinder(paths, git.NewPathFilter(nil, nil, c.cfg.Parser.CompileRelaxed())).Find()
 	if err != nil {
 		return err
 	}
@@ -310,7 +355,11 @@ func (c *problemCollector) Collect(ch chan<- prometheus.Metric) {
 
 	for _, report := range c.summary.Reports() {
 		if report.Problem.Severity < c.minSeverity {
-			slog.Debug("Skipping report", slog.String("severity", report.Problem.Severity.String()))
+			slog.Debug(
+				"Skipping report with severity lower than minimum configured",
+				slog.String("severity", report.Problem.Severity.String()),
+				slog.String("minimum", c.minSeverity.String()),
+			)
 			continue
 		}
 
@@ -363,3 +412,5 @@ func (c *problemCollector) Collect(ch chan<- prometheus.Metric) {
 		}
 	}
 }
+
+type pathFinderFunc func(ctx context.Context) ([]string, error)
diff --git a/docs/changelog.md b/docs/changelog.md
index 1c478ba7..7a54e974 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -7,6 +7,11 @@
 - [labels/conflict](checks/labels/conflict.md) check will now warn if alerting rule is setting
   labels that are already configured as external labels.
 
+### Changed
+
+- `pint watch` command now has two sub-commands: `pint watch glob` and `pint watch rule_files`.
+  See [watch mode](index.md#watch-mode) docs for details.
+
 ### Fixed
 
 - Fixed false positive reports from [rule/duplicate](checks/rule/duplicate.md) when
diff --git a/docs/index.md b/docs/index.md
index 40e51221..6b7047d0 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -173,17 +173,18 @@
 then pint will also run "online" checks for it to, for example, ensure all time series
 used inside your alerting rules are still present.
 
 Example config:
 
-```json
+```js
 prometheus "local" {
-    uri = "http://localhost:9090"
+  uri = "http://localhost:9090"
 }
 ```
 
 #### Getting list of files to check from Prometheus
 
 You can also point pint directly at a Prometheus server from the config file.
-This will make pint query Prometheus API to get the current value of `rule_files`
-Prometheus config option and then run checks on all matching files.
+On every iteration, before starting any checks, pint will query the Prometheus API
+to get the current value of the `rule_files` Prometheus config option and then run
+checks on all matching files.
 This way if you test your rules against a running Prometheus instance then you
 don't need to manually specify any paths or directories.
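+
+For example, with the `prometheus "local"` block above defined in pint's
+config file, this mode can be started with (a sketch; the argument is the
+server name from pint's own config, not a URI):
+
+```sh
+pint watch rule_files local
+```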
@@ -272,6 +273,48 @@ Here's an example alert you can use for problems detected by pint:
 
 {% endraw %}
 
+## YAML parser
+
+By default pint expects all Prometheus rule files to follow the exact
+syntax Prometheus uses for YAML files containing [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
+and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
+rules.
+If you have Prometheus rules stored in YAML files with a different tree structure,
+but where each rule still has the same set of fields, for example:
+
+```yaml
+# Flat rule list
+- alert: AlertName
+  expr: up == 0
+- record: sum:up
+  expr: count(up == 1)
+```
+
+```yaml
+# Rules nested under custom tree
+service:
+  prometheus:
+    rules:
+      - alert: AlertName
+        expr: up == 0
+      - record: sum:up
+        expr: count(up == 1)
+```
+
+You can still check these rules using pint, but you need to switch pint's YAML
+parser into "relaxed" mode by adding this section to the pint config file:
+
+```js
+parser {
+  relaxed = [ "my/files/*.yml" ]
+}
+```
+
+See [parser](configuration.md#parser) documentation for more details.
+"Relaxed" parser mode will load anything that can be parsed as a Prometheus rule,
+while "strict" parser mode will fail if it reads a file that wouldn't load
+cleanly as a Prometheus rule file.
+
 ## Control comments
 
 There are a number of comments you can add to your rule files in order to change
diff --git a/internal/checks/alerts_external_labels.go b/internal/checks/alerts_external_labels.go
index cdd70506..53f17d0d 100644
--- a/internal/checks/alerts_external_labels.go
+++ b/internal/checks/alerts_external_labels.go
@@ -52,7 +52,7 @@ func (c AlertsExternalLabelsCheck) Check(ctx context.Context, _ string, rule par
 		return problems
 	}
 
-	cfg, err := c.prom.Config(ctx)
+	cfg, err := c.prom.Config(ctx, 0)
 	if err != nil {
 		text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
 		problems = append(problems, Problem{
diff --git a/internal/checks/labels_conflict.go b/internal/checks/labels_conflict.go
index 35c2c44a..e4cb7ff2 100644
--- a/internal/checks/labels_conflict.go
+++ b/internal/checks/labels_conflict.go
@@ -53,7 +53,7 @@ func (c LabelsConflictCheck) Check(ctx context.Context, _ string, rule parser.Ru
 		return problems
 	}
 
-	cfg, err := c.prom.Config(ctx)
+	cfg, err := c.prom.Config(ctx, 0)
 	if err != nil {
 		text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Warning)
 		problems = append(problems, Problem{
diff --git a/internal/checks/promql_rate.go b/internal/checks/promql_rate.go
index 3f1717fc..7b3eaae2 100644
--- a/internal/checks/promql_rate.go
+++ b/internal/checks/promql_rate.go
@@ -67,7 +67,7 @@ func (c RateCheck) Check(ctx context.Context, _ string, rule parser.Rule, entrie
 		return problems
 	}
 
-	cfg, err := c.prom.Config(ctx)
+	cfg, err := c.prom.Config(ctx, 0)
 	if err != nil {
 		text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
 		problems = append(problems, Problem{
diff --git a/internal/config/prometheus.go b/internal/config/prometheus.go
index 6f23af5a..a8469ff8 100644
--- a/internal/config/prometheus.go
+++ b/internal/config/prometheus.go
@@ -224,6 +224,15 @@ func (pg *PrometheusGenerator) ServersForPath(path string) []*promapi.FailoverGr
 	return servers
 }
 
+func (pg *PrometheusGenerator) ServerWithName(name string) *promapi.FailoverGroup {
+	for _, server := range pg.servers {
+		if server.Name() == name {
+			return server
+		}
+	}
+	return nil
+}
+
 func (pg *PrometheusGenerator) addServer(server *promapi.FailoverGroup) error {
 	for _, s := range pg.servers {
 		if s.Name() == server.Name() {
diff --git a/internal/promapi/config.go b/internal/promapi/config.go
index 52da354d..bbf84f00 100644
--- a/internal/promapi/config.go
+++ b/internal/promapi/config.go
@@ -23,7 +23,8 @@ type ConfigSectionGlobal struct {
 }
 
 type PrometheusConfig struct {
-	Global ConfigSectionGlobal `yaml:"global"`
+	Global    ConfigSectionGlobal `yaml:"global"`
+	RuleFiles []string            `yaml:"rule_files"`
 }
 
 type ConfigResult struct {
@@ -36,6 +37,7 @@ type configQuery struct {
 	prom      *Prometheus
 	ctx       context.Context
 	timestamp time.Time
+	cacheTTL  time.Duration
 }
 
 func (q configQuery) Run() queryResult {
@@ -80,19 +82,23 @@ func (q configQuery) CacheKey() uint64 {
 }
 
 func (q configQuery) CacheTTL() time.Duration {
-	return time.Minute * 10
+	return q.cacheTTL
 }
 
-func (p *Prometheus) Config(ctx context.Context) (*ConfigResult, error) {
+func (p *Prometheus) Config(ctx context.Context, cacheTTL time.Duration) (*ConfigResult, error) {
 	slog.Debug("Scheduling Prometheus configuration query", slog.String("uri", p.safeURI))
 
 	key := "/api/v1/status/config"
 	p.locker.lock(key)
 	defer p.locker.unlock(key)
 
+	if cacheTTL == 0 {
+		cacheTTL = time.Minute
+	}
+
 	resultChan := make(chan queryResult)
 	p.queries <- queryRequest{
-		query:  configQuery{prom: p, ctx: ctx, timestamp: time.Now()},
+		query:  configQuery{prom: p, ctx: ctx, timestamp: time.Now(), cacheTTL: cacheTTL},
 		result: resultChan,
 	}
 
diff --git a/internal/promapi/config_test.go b/internal/promapi/config_test.go
index 9d597b36..44a4f2b3 100644
--- a/internal/promapi/config_test.go
+++ b/internal/promapi/config_test.go
@@ -129,7 +129,7 @@ func TestConfig(t *testing.T) {
 			prom.StartWorkers()
 			defer prom.Close()
 
-			cfg, err := prom.Config(context.Background())
+			cfg, err := prom.Config(context.Background(), time.Minute)
 			if tc.err != "" {
 				require.EqualError(t, err, tc.err, tc)
 			} else {
@@ -187,7 +187,7 @@ func TestConfigHeaders(t *testing.T) {
 		fg.StartWorkers(reg)
 		defer fg.Close(reg)
 
-		_, err := fg.Config(context.Background())
+		_, err := fg.Config(context.Background(), 0)
 		require.NoError(t, err)
 	})
 }
diff --git a/internal/promapi/failover.go b/internal/promapi/failover.go
index aeafa1b0..1f7d4c35 100644
--- a/internal/promapi/failover.go
+++ b/internal/promapi/failover.go
@@ -182,11 +182,11 @@ func (fg *FailoverGroup) CleanCache() {
 	}
 }
 
-func (fg *FailoverGroup) Config(ctx context.Context) (cfg *ConfigResult, err error) {
+func (fg *FailoverGroup) Config(ctx context.Context, cacheTTL time.Duration) (cfg *ConfigResult, err error) {
 	var uri string
 	for _, prom := range fg.servers {
 		uri = prom.safeURI
-		cfg, err = prom.Config(ctx)
+		cfg, err = prom.Config(ctx, cacheTTL)
 		if err == nil {
 			return cfg, nil
 		}
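
Not part of the patch, but to illustrate the design: both sub-commands now feed
the watch loop through the new `pathFinderFunc` type added at the bottom of
cmd/pint/watch.go, so the set of checked files can change between iterations.
A minimal sketch of what a third finder could look like (the package name, the
`newFileListFinder` helper, and the file-list format are hypothetical, for
illustration only, not pint API):

```go
package finder

import (
	"context"
	"os"
	"strings"
)

// pathFinderFunc mirrors the signature this patch adds to cmd/pint/watch.go:
// it is called on every watch iteration, so the returned list may change
// between scans.
type pathFinderFunc func(ctx context.Context) ([]string, error)

// newFileListFinder (hypothetical) builds a finder that re-reads one path or
// glob per line from a plain text file, mimicking how the rule_files finder
// re-queries the Prometheus configuration on each iteration.
func newFileListFinder(listPath string) pathFinderFunc {
	return func(_ context.Context) ([]string, error) {
		raw, err := os.ReadFile(listPath)
		if err != nil {
			return nil, err
		}
		var paths []string
		for _, line := range strings.Split(string(raw), "\n") {
			if line = strings.TrimSpace(line); line != "" {
				paths = append(paths, line)
			}
		}
		return paths, nil
	}
}
```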