@@ -9,6 +9,7 @@ package lustre2
9
9
10
10
import (
11
11
_ "embed"
12
+ "fmt"
12
13
"os"
13
14
"path/filepath"
14
15
"regexp"
@@ -23,7 +24,7 @@ import (
23
24
var sampleConfig string
24
25
25
26
type tags struct {
26
- name , job , client string
27
+ name , brwSection , bucket , job , client string
27
28
}
28
29
29
30
// Lustre proc files can change between versions, so we want to future-proof
@@ -32,7 +33,7 @@ type Lustre2 struct {
32
33
OstProcfiles []string `toml:"ost_procfiles"`
33
34
MdsProcfiles []string `toml:"mds_procfiles"`
34
35
35
- // allFields maps and OST name to the metric fields associated with that OST
36
+ // allFields maps an OST name to the metric fields associated with that OST
36
37
allFields map [tags ]map [string ]interface {}
37
38
}
38
39
@@ -49,6 +50,29 @@ type mapping struct {
49
50
reportAs string // What measurement name to use
50
51
}
51
52
53
+ var wantedBrwstatsFields = []* mapping {
54
+ {
55
+ inProc : "pages per bulk r/w" ,
56
+ reportAs : "pages_per_bulk_rw" ,
57
+ },
58
+ {
59
+ inProc : "discontiguous pages" ,
60
+ reportAs : "discontiguous_pages" ,
61
+ },
62
+ {
63
+ inProc : "disk I/Os in flight" ,
64
+ reportAs : "disk_ios_in_flight" ,
65
+ },
66
+ {
67
+ inProc : "I/O time (1/1000s)" ,
68
+ reportAs : "io_time" ,
69
+ },
70
+ {
71
+ inProc : "disk I/O size" ,
72
+ reportAs : "disk_io_size" ,
73
+ },
74
+ }
75
+
52
76
var wantedOstFields = []* mapping {
53
77
{
54
78
inProc : "write_bytes" ,
@@ -408,10 +432,10 @@ func (l *Lustre2) GetLustreProcStats(fileglob string, wantedFields []*mapping) e
408
432
}
409
433
410
434
var fields map [string ]interface {}
411
- fields , ok := l .allFields [tags {name , jobid , client }]
435
+ fields , ok := l .allFields [tags {name , "" , "" , jobid , client }]
412
436
if ! ok {
413
437
fields = make (map [string ]interface {})
414
- l .allFields [tags {name , jobid , client }] = fields
438
+ l .allFields [tags {name , "" , "" , jobid , client }] = fields
415
439
}
416
440
417
441
for _ , wanted := range wantedFields {
@@ -440,6 +464,100 @@ func (l *Lustre2) GetLustreProcStats(fileglob string, wantedFields []*mapping) e
440
464
return nil
441
465
}
442
466
467
+ func (l * Lustre2 ) getLustreProcBrwStats (fileglob string , wantedFields []* mapping ) error {
468
+ files , err := filepath .Glob (fileglob )
469
+ if err != nil {
470
+ return fmt .Errorf ("failed to find files matching glob %s: %w" , fileglob , err )
471
+ }
472
+
473
+ for _ , file := range files {
474
+ // todo get client too
475
+ // Turn /proc/fs/lustre/obdfilter/<ost_name>/stats and similar into just the object store target name
476
+ // This assumes that the target name is always second to last, which is true in Lustre 2.1->2.12
477
+ path := strings .Split (file , "/" )
478
+ if len (path ) < 2 {
479
+ continue
480
+ }
481
+ name := path [len (path )- 2 ]
482
+
483
+ wholeFile , err := os .ReadFile (file )
484
+ if err != nil {
485
+ return fmt .Errorf ("failed to read file %s: %w" , file , err )
486
+ }
487
+ lines := strings .Split (string (wholeFile ), "\n " )
488
+
489
+ var headerName string
490
+ for _ , line := range lines {
491
+ // There are four types of lines in a brw_stats file:
492
+ // 1. Header lines - contain the category of metric (e.g. disk I/Os in flight, disk I/O time)
493
+ // 2. Bucket lines - follow headers, contain the bucket value (e.g. 4K, 1M) and metric values
494
+ // 3. Empty lines - these will simply be filtered out
495
+ // 4. snapshot_time line - this will be filtered out, as it "looks" like a bucket line
496
+ if len (line ) < 1 {
497
+ continue
498
+ }
499
+ parts := strings .Fields (line )
500
+
501
+ // This is a header line
502
+ // Set report name for use by the buckets that follow
503
+ if strings .Contains (parts [0 ], ":" ) == false {
504
+ nameParts := strings .Split (line , " " )
505
+ headerName = nameParts [0 ]
506
+ continue
507
+ }
508
+
509
+ // snapshot_time should be discarded
510
+ if strings .Contains (parts [0 ], "snapshot_time" ) {
511
+ continue
512
+ }
513
+
514
+ // This is a bucket for a given header
515
+ for _ , wanted := range wantedFields {
516
+ if headerName != wanted .inProc {
517
+ continue
518
+ }
519
+ bucket := strings .TrimSuffix (parts [0 ], ":" )
520
+
521
+ // brw_stats columns are static and don't need configurable fields
522
+ readIos , err := strconv .ParseUint (parts [1 ], 10 , 64 )
523
+ if err != nil {
524
+ return fmt .Errorf ("failed to parse read_ios: %w" , err )
525
+ }
526
+ readPercent , err := strconv .ParseUint (parts [2 ], 10 , 64 )
527
+ if err != nil {
528
+ return fmt .Errorf ("failed to parse read_percent: %w" , err )
529
+ }
530
+ writeIos , err := strconv .ParseUint (parts [5 ], 10 , 64 )
531
+ if err != nil {
532
+ return fmt .Errorf ("failed to parse write_ios: %w" , err )
533
+ }
534
+ writePercent , err := strconv .ParseUint (parts [6 ], 10 , 64 )
535
+ if err != nil {
536
+ return fmt .Errorf ("failed to parse write_percent: %w" , err )
537
+ }
538
+ reportName := headerName
539
+ if wanted .reportAs != "" {
540
+ reportName = wanted .reportAs
541
+ }
542
+
543
+ // todo pass client?
544
+ tag := tags {name , reportName , bucket , "" , "" }
545
+ fields , ok := l .allFields [tag ]
546
+ if ! ok {
547
+ fields = make (map [string ]interface {})
548
+ l .allFields [tag ] = fields
549
+ }
550
+
551
+ fields ["read_ios" ] = readIos
552
+ fields ["read_percent" ] = readPercent
553
+ fields ["write_ios" ] = writeIos
554
+ fields ["write_percent" ] = writePercent
555
+ }
556
+ }
557
+ }
558
+ return nil
559
+ }
560
+
443
561
// Gather reads stats from all lustre targets
444
562
func (l * Lustre2 ) Gather (acc telegraf.Accumulator ) error {
445
563
l .allFields = make (map [tags ]map [string ]interface {})
@@ -460,6 +578,16 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
460
578
if err != nil {
461
579
return err
462
580
}
581
+ // bulk read/wrote statistics for ldiskfs
582
+ err = l .getLustreProcBrwStats ("/proc/fs/lustre/osd-ldiskfs/*/brw_stats" , wantedBrwstatsFields )
583
+ if err != nil {
584
+ return err
585
+ }
586
+ // bulk read/write statistics for zfs
587
+ err = l .getLustreProcBrwStats ("/proc/fs/lustre/osd-zfs/*/brw_stats" , wantedBrwstatsFields )
588
+ if err != nil {
589
+ return err
590
+ }
463
591
}
464
592
465
593
if len (l .MdsProcfiles ) == 0 {
@@ -477,30 +605,52 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
477
605
}
478
606
479
607
for _ , procfile := range l .OstProcfiles {
480
- ostFields := wantedOstFields
481
- if strings .HasSuffix (procfile , "job_stats" ) {
482
- ostFields = wantedOstJobstatsFields
483
- }
484
- err := l .GetLustreProcStats (procfile , ostFields )
485
- if err != nil {
486
- return err
608
+ if strings .HasSuffix (procfile , "brw_stats" ) {
609
+ err := l .getLustreProcBrwStats (procfile , wantedBrwstatsFields )
610
+ if err != nil {
611
+ return err
612
+ }
613
+ } else if strings .HasSuffix (procfile , "job_stats" ) {
614
+ err := l .GetLustreProcStats (procfile , wantedOstJobstatsFields )
615
+ if err != nil {
616
+ return err
617
+ }
618
+ } else {
619
+ err := l .GetLustreProcStats (procfile , wantedOstFields )
620
+ if err != nil {
621
+ return err
622
+ }
487
623
}
488
624
}
489
625
for _ , procfile := range l .MdsProcfiles {
490
- mdtFields := wantedMdsFields
491
- if strings .HasSuffix (procfile , "job_stats" ) {
492
- mdtFields = wantedMdtJobstatsFields
493
- }
494
- err := l .GetLustreProcStats (procfile , mdtFields )
495
- if err != nil {
496
- return err
626
+ if strings .HasSuffix (procfile , "brw_stats" ) {
627
+ err := l .getLustreProcBrwStats (procfile , wantedBrwstatsFields )
628
+ if err != nil {
629
+ return err
630
+ }
631
+ } else if strings .HasSuffix (procfile , "job_stats" ) {
632
+ err := l .GetLustreProcStats (procfile , wantedMdtJobstatsFields )
633
+ if err != nil {
634
+ return err
635
+ }
636
+ } else {
637
+ err := l .GetLustreProcStats (procfile , wantedMdsFields )
638
+ if err != nil {
639
+ return err
640
+ }
497
641
}
498
642
}
499
643
500
644
for tgs , fields := range l .allFields {
501
645
tags := map [string ]string {
502
646
"name" : tgs .name ,
503
647
}
648
+ if len (tgs .brwSection ) > 0 {
649
+ tags ["brw_section" ] = tgs .brwSection
650
+ }
651
+ if len (tgs .bucket ) > 0 {
652
+ tags ["bucket" ] = tgs .bucket
653
+ }
504
654
if len (tgs .job ) > 0 {
505
655
tags ["jobid" ] = tgs .job
506
656
}
0 commit comments