Skip to content

Commit a7704eb

Browse files
committed
feat(inputs.lustre2): support reading bulk read/write stats
1 parent a7f0b06 commit a7704eb

File tree

4 files changed

+292
-20
lines changed

4 files changed

+292
-20
lines changed

plugins/inputs/lustre2/README.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,22 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
2222
# This plugin ONLY supports Linux
2323
[[inputs.lustre2]]
2424
## An array of /proc globs to search for Lustre stats
25-
## If not specified, the default will work on Lustre 2.5.x
25+
## If not specified, the default will work on Lustre 2.12.x
2626
##
2727
# ost_procfiles = [
2828
# "/proc/fs/lustre/obdfilter/*/stats",
2929
# "/proc/fs/lustre/osd-ldiskfs/*/stats",
3030
# "/proc/fs/lustre/obdfilter/*/job_stats",
3131
# "/proc/fs/lustre/obdfilter/*/exports/*/stats",
32+
# "/proc/fs/lustre/osd-ldiskfs/*/brw_stats",
33+
# "/proc/fs/lustre/osd-zfs/*/brw_stats",
3234
# ]
3335
# mds_procfiles = [
3436
# "/proc/fs/lustre/mdt/*/md_stats",
3537
# "/proc/fs/lustre/mdt/*/job_stats",
3638
# "/proc/fs/lustre/mdt/*/exports/*/stats",
39+
# "/proc/fs/lustre/osd-ldiskfs/*/brw_stats",
40+
# "/proc/fs/lustre/osd-zfs/*/brw_stats",
3741
# ]
3842
```
3943

plugins/inputs/lustre2/lustre2.go

+168-18
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ package lustre2
99

1010
import (
1111
_ "embed"
12+
"fmt"
1213
"os"
1314
"path/filepath"
1415
"regexp"
@@ -23,7 +24,7 @@ import (
2324
var sampleConfig string
2425

2526
type tags struct {
26-
name, job, client string
27+
name, brwSection, bucket, job, client string
2728
}
2829

2930
// Lustre proc files can change between versions, so we want to future-proof
@@ -32,7 +33,7 @@ type Lustre2 struct {
3233
OstProcfiles []string `toml:"ost_procfiles"`
3334
MdsProcfiles []string `toml:"mds_procfiles"`
3435

35-
// allFields maps and OST name to the metric fields associated with that OST
36+
// allFields maps an OST name to the metric fields associated with that OST
3637
allFields map[tags]map[string]interface{}
3738
}
3839

@@ -49,6 +50,29 @@ type mapping struct {
4950
reportAs string // What measurement name to use
5051
}
5152

53+
var wantedBrwstatsFields = []*mapping{
54+
{
55+
inProc: "pages per bulk r/w",
56+
reportAs: "pages_per_bulk_rw",
57+
},
58+
{
59+
inProc: "discontiguous pages",
60+
reportAs: "discontiguous_pages",
61+
},
62+
{
63+
inProc: "disk I/Os in flight",
64+
reportAs: "disk_ios_in_flight",
65+
},
66+
{
67+
inProc: "I/O time (1/1000s)",
68+
reportAs: "io_time",
69+
},
70+
{
71+
inProc: "disk I/O size",
72+
reportAs: "disk_io_size",
73+
},
74+
}
75+
5276
var wantedOstFields = []*mapping{
5377
{
5478
inProc: "write_bytes",
@@ -408,10 +432,10 @@ func (l *Lustre2) GetLustreProcStats(fileglob string, wantedFields []*mapping) e
408432
}
409433

410434
var fields map[string]interface{}
411-
fields, ok := l.allFields[tags{name, jobid, client}]
435+
fields, ok := l.allFields[tags{name, "", "", jobid, client}]
412436
if !ok {
413437
fields = make(map[string]interface{})
414-
l.allFields[tags{name, jobid, client}] = fields
438+
l.allFields[tags{name, "", "", jobid, client}] = fields
415439
}
416440

417441
for _, wanted := range wantedFields {
@@ -440,6 +464,100 @@ func (l *Lustre2) GetLustreProcStats(fileglob string, wantedFields []*mapping) e
440464
return nil
441465
}
442466

467+
func (l *Lustre2) getLustreProcBrwStats(fileglob string, wantedFields []*mapping) error {
468+
files, err := filepath.Glob(fileglob)
469+
if err != nil {
470+
return fmt.Errorf("failed to find files matching glob %s: %w", fileglob, err)
471+
}
472+
473+
for _, file := range files {
474+
// todo get client too
475+
// Turn /proc/fs/lustre/obdfilter/<ost_name>/stats and similar into just the object store target name
476+
// This assumes that the target name is always second to last, which is true in Lustre 2.1->2.12
477+
path := strings.Split(file, "/")
478+
if len(path) < 2 {
479+
continue
480+
}
481+
name := path[len(path)-2]
482+
483+
wholeFile, err := os.ReadFile(file)
484+
if err != nil {
485+
return fmt.Errorf("failed to read file %s: %w", file, err)
486+
}
487+
lines := strings.Split(string(wholeFile), "\n")
488+
489+
var headerName string
490+
for _, line := range lines {
491+
// There are four types of lines in a brw_stats file:
492+
// 1. Header lines - contain the category of metric (e.g. disk I/Os in flight, disk I/O time)
493+
// 2. Bucket lines - follow headers, contain the bucket value (e.g. 4K, 1M) and metric values
494+
// 3. Empty lines - these will simply be filtered out
495+
// 4. snapshot_time line - this will be filtered out, as it "looks" like a bucket line
496+
if len(line) < 1 {
497+
continue
498+
}
499+
parts := strings.Fields(line)
500+
501+
// This is a header line
502+
// Set report name for use by the buckets that follow
503+
if strings.Contains(parts[0], ":") == false {
504+
nameParts := strings.Split(line, " ")
505+
headerName = nameParts[0]
506+
continue
507+
}
508+
509+
// snapshot_time should be discarded
510+
if strings.Contains(parts[0], "snapshot_time") {
511+
continue
512+
}
513+
514+
// This is a bucket for a given header
515+
for _, wanted := range wantedFields {
516+
if headerName != wanted.inProc {
517+
continue
518+
}
519+
bucket := strings.TrimSuffix(parts[0], ":")
520+
521+
// brw_stats columns are static and don't need configurable fields
522+
readIos, err := strconv.ParseUint(parts[1], 10, 64)
523+
if err != nil {
524+
return fmt.Errorf("failed to parse read_ios: %w", err)
525+
}
526+
readPercent, err := strconv.ParseUint(parts[2], 10, 64)
527+
if err != nil {
528+
return fmt.Errorf("failed to parse read_percent: %w", err)
529+
}
530+
writeIos, err := strconv.ParseUint(parts[5], 10, 64)
531+
if err != nil {
532+
return fmt.Errorf("failed to parse write_ios: %w", err)
533+
}
534+
writePercent, err := strconv.ParseUint(parts[6], 10, 64)
535+
if err != nil {
536+
return fmt.Errorf("failed to parse write_percent: %w", err)
537+
}
538+
reportName := headerName
539+
if wanted.reportAs != "" {
540+
reportName = wanted.reportAs
541+
}
542+
543+
// todo pass client?
544+
tag := tags{name, reportName, bucket, "", ""}
545+
fields, ok := l.allFields[tag]
546+
if !ok {
547+
fields = make(map[string]interface{})
548+
l.allFields[tag] = fields
549+
}
550+
551+
fields["read_ios"] = readIos
552+
fields["read_percent"] = readPercent
553+
fields["write_ios"] = writeIos
554+
fields["write_percent"] = writePercent
555+
}
556+
}
557+
}
558+
return nil
559+
}
560+
443561
// Gather reads stats from all lustre targets
444562
func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
445563
l.allFields = make(map[tags]map[string]interface{})
@@ -460,6 +578,16 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
460578
if err != nil {
461579
return err
462580
}
581+
// bulk read/wrote statistics for ldiskfs
582+
err = l.getLustreProcBrwStats("/proc/fs/lustre/osd-ldiskfs/*/brw_stats", wantedBrwstatsFields)
583+
if err != nil {
584+
return err
585+
}
586+
// bulk read/write statistics for zfs
587+
err = l.getLustreProcBrwStats("/proc/fs/lustre/osd-zfs/*/brw_stats", wantedBrwstatsFields)
588+
if err != nil {
589+
return err
590+
}
463591
}
464592

465593
if len(l.MdsProcfiles) == 0 {
@@ -477,30 +605,52 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
477605
}
478606

479607
for _, procfile := range l.OstProcfiles {
480-
ostFields := wantedOstFields
481-
if strings.HasSuffix(procfile, "job_stats") {
482-
ostFields = wantedOstJobstatsFields
483-
}
484-
err := l.GetLustreProcStats(procfile, ostFields)
485-
if err != nil {
486-
return err
608+
if strings.HasSuffix(procfile, "brw_stats") {
609+
err := l.getLustreProcBrwStats(procfile, wantedBrwstatsFields)
610+
if err != nil {
611+
return err
612+
}
613+
} else if strings.HasSuffix(procfile, "job_stats") {
614+
err := l.GetLustreProcStats(procfile, wantedOstJobstatsFields)
615+
if err != nil {
616+
return err
617+
}
618+
} else {
619+
err := l.GetLustreProcStats(procfile, wantedOstFields)
620+
if err != nil {
621+
return err
622+
}
487623
}
488624
}
489625
for _, procfile := range l.MdsProcfiles {
490-
mdtFields := wantedMdsFields
491-
if strings.HasSuffix(procfile, "job_stats") {
492-
mdtFields = wantedMdtJobstatsFields
493-
}
494-
err := l.GetLustreProcStats(procfile, mdtFields)
495-
if err != nil {
496-
return err
626+
if strings.HasSuffix(procfile, "brw_stats") {
627+
err := l.getLustreProcBrwStats(procfile, wantedBrwstatsFields)
628+
if err != nil {
629+
return err
630+
}
631+
} else if strings.HasSuffix(procfile, "job_stats") {
632+
err := l.GetLustreProcStats(procfile, wantedMdtJobstatsFields)
633+
if err != nil {
634+
return err
635+
}
636+
} else {
637+
err := l.GetLustreProcStats(procfile, wantedMdsFields)
638+
if err != nil {
639+
return err
640+
}
497641
}
498642
}
499643

500644
for tgs, fields := range l.allFields {
501645
tags := map[string]string{
502646
"name": tgs.name,
503647
}
648+
if len(tgs.brwSection) > 0 {
649+
tags["brw_section"] = tgs.brwSection
650+
}
651+
if len(tgs.bucket) > 0 {
652+
tags["bucket"] = tgs.bucket
653+
}
504654
if len(tgs.job) > 0 {
505655
tags["jobid"] = tgs.job
506656
}

0 commit comments

Comments
 (0)