Skip to content

Commit 2925f29

Browse files
leftwoAlan Hanson
and
Alan Hanson
authored
Add more DTrace tooling (#1585)
Added get-up-state bash and D scripts for looking at the sled overall and producing a high-level summary.

This script will gather some selected dtrace stats for 10 seconds, then print out a summary.

In the first example here we have 3 unique propolis-server processes. We print a line for each PID/Session (a single PID can have multiple sessions):

```
  PID  SESSION DS0 DS1 DS2   NEXT_JOB DELTA CONN   ELR   ELC   ERR   ERN
 9972 b5d1cbe7 ACT ACT ACT      65953    12    3     0     0     0     0
12059 69cc7aa8 ACT ACT ACT       2095     0    3     0     0     0     0
12059 924f18ed ACT ACT ACT       1444     0    3     0     0     0     0
12059 d7e7d0fd ACT ACT ACT      30292     0    3     0     0     0     0
12172 74ddab44 ACT ACT ACT     688093    83    3     0     0     0     0
12172 a151673e ACT ACT ACT       2198     0    3     0     0     0     0
```

I've hacked together a summary of the downstairs states into three letters. Not all states have three-letter summaries, but I've captured the common ones.

The DELTA is the number of jobs that went through this PID/Session in the 10 seconds we were watching.
`CONN` is the number of times the upstairs has connected to a downstairs (the sum of all client connections).
`ELR` is extents that have been live repaired.
`ELC` is extents that were checked during LR, but no repair was needed.
`ERR` is extents that were reconciled (happens on startup).
`ERN` is the remaining number of extents we need to reconcile.

Here is another example. In this case you can see that some extents were reconciled when propolis first started.

```
  PID  SESSION DS0 DS1 DS2   NEXT_JOB DELTA CONN   ELR   ELC   ERR   ERN
 9200 5827dcae ACT ACT ACT      15326     0    3     0     0     0     0
11977 9ab0865f ACT ACT ACT       1309     0    3     0     0     0     0
12595 4878f9f0 ACT ACT ACT      16944     0    3     0     0     0     0
13891 fb840f9f ACT ACT ACT  464968478 38777    3     0     0   400     0
13931 d5613d2e ACT ACT ACT      94948     0    3     0     0    24     0
```

This status script found the #1579 bug.

Updated upstairs_count.d to include barrier operations.

---------

Co-authored-by: Alan Hanson <alan@oxide.computer>
1 parent 5a41b82 commit 2925f29

File tree

4 files changed

+181
-4
lines changed

4 files changed

+181
-4
lines changed

tools/dtrace/get-up-state.d

+141
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/*
2+
* Display Upstairs status for all matching processes
3+
*/
4+
#pragma D option quiet
5+
#pragma D option strsize=1k
6+
7+
/*
8+
* Print the header right away
9+
*/
10+
dtrace:::BEGIN
{
    /*
     * Seed last_id with a throwaway entry so the array exists before
     * any session is seen; sessions we have not recorded yet then read
     * back as the default value.
     */
    last_id["string"] = (int64_t)1;

    /*
     * One header line.  The column widths line up with the row that
     * the up-status probe prints for each session.
     */
    printf("%5s %8s %3s %3s %3s %10s %5s %4s %5s %5s %5s %5s\n",
        "PID", "SESSION", "DS0", "DS1", "DS2",
        "NEXT_JOB", "DELTA", "CONN",
        "ELR", "ELC", "ERR", "ERN");
}
24+
25+
/*
26+
* After reporting for 10 seconds, exit
27+
*/
28+
profile:::tick-10s
{
    /* The sampling window is over: stop tracing and let dtrace exit. */
    exit(0);
}
32+
33+
/*
34+
* All variables should be this->
35+
* Otherwise, there is a chance another probe will fire and
36+
* clobber the contents.
37+
*/
38+
crucible_upstairs*:::up-status
{
    /*
     * arg1 points at the status payload, a JSON string in the traced
     * process.  Copy it in once and parse every field from that copy
     * instead of repeating the copyin for each lookup.
     *
     * Every variable in this clause is clause-local (this->) so that
     * firings from other sessions cannot clobber the values part-way
     * through.
     */
    this->status = copyinstr(arg1);

    /*
     * Reduce each downstairs state to a three letter code.  A state
     * without a code is printed in full.
     */
    this->ds0state = json(this->status, "ok.ds_state[0]");
    if (this->ds0state == "active") {
        this->d0 = "ACT";
    } else if (this->ds0state == "new") {
        this->d0 = "NEW";
    } else if (this->ds0state == "live_repair_ready") {
        this->d0 = "LRR";
    } else if (this->ds0state == "live_repair") {
        this->d0 = " LR";
    } else if (this->ds0state == "faulted") {
        this->d0 = "FLT";
    } else if (this->ds0state == "offline") {
        this->d0 = "OFL";
    } else {
        this->d0 = this->ds0state;
    }

    this->ds1state = json(this->status, "ok.ds_state[1]");
    if (this->ds1state == "active") {
        this->d1 = "ACT";
    } else if (this->ds1state == "new") {
        this->d1 = "NEW";
    } else if (this->ds1state == "live_repair_ready") {
        this->d1 = "LRR";
    } else if (this->ds1state == "live_repair") {
        this->d1 = " LR";
    } else if (this->ds1state == "faulted") {
        this->d1 = "FLT";
    } else if (this->ds1state == "offline") {
        this->d1 = "OFL";
    } else {
        this->d1 = this->ds1state;
    }

    this->ds2state = json(this->status, "ok.ds_state[2]");
    if (this->ds2state == "active") {
        this->d2 = "ACT";
    } else if (this->ds2state == "new") {
        this->d2 = "NEW";
    } else if (this->ds2state == "live_repair_ready") {
        this->d2 = "LRR";
    } else if (this->ds2state == "live_repair") {
        this->d2 = " LR";
    } else if (this->ds2state == "faulted") {
        this->d2 = "FLT";
    } else if (this->ds2state == "offline") {
        this->d2 = "OFL";
    } else {
        this->d2 = this->ds2state;
    }

    /* Rows are keyed and labelled by the first 8 characters of the ID. */
    this->full_session_id = json(this->status, "ok.session_id");
    this->session_id = substr(this->full_session_id, 0, 8);

    this->next_id_str = json(this->status, "ok.next_job_id");
    this->next_id_value = strtoll(this->next_id_str);

    /*
     * last_id is recorded only the first time a session is seen, so
     * DELTA is how far next_job_id has advanced since this script
     * first saw the session.
     */
    if (last_id[this->session_id] == 0) {
        this->delta = 0;
        last_id[this->session_id] = this->next_id_value;
    } else {
        this->delta = this->next_id_value - last_id[this->session_id];
    }

    /* Total of extents live repaired */
    this->elr = strtoll(json(this->status, "ok.ds_extents_repaired[0]")) +
        strtoll(json(this->status, "ok.ds_extents_repaired[1]")) +
        strtoll(json(this->status, "ok.ds_extents_repaired[2]"));

    /* Total of extents not needing repair during live repair */
    this->elc = strtoll(json(this->status, "ok.ds_extents_confirmed[0]")) +
        strtoll(json(this->status, "ok.ds_extents_confirmed[1]")) +
        strtoll(json(this->status, "ok.ds_extents_confirmed[2]"));

    /* Sum of the connection counts of the three downstairs */
    this->connections = strtoll(json(this->status, "ok.ds_connected[0]")) +
        strtoll(json(this->status, "ok.ds_connected[1]")) +
        strtoll(json(this->status, "ok.ds_connected[2]"));

    printf("%5d %8s %3s %3s %3s %10s %5d %4d %5d %5d %5s %5s\n",
        pid,
        this->session_id,

        /*
         * State for the three downstairs
         */
        this->d0,
        this->d1,
        this->d2,

        /*
         * Next job ID, job delta and total connections
         */
        this->next_id_str,
        this->delta,
        this->connections,

        /*
         * Live repair totals (repaired, confirmed), then the
         * reconciled and reconcile-needed values as reported.
         */
        this->elr,
        this->elc,
        json(this->status, "ok.ds_reconciled"),
        json(this->status, "ok.ds_reconcile_needed"));
}

tools/dtrace/get-up-state.sh

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
#
# Print a one-line-per-session summary of Upstairs state.
#
# Runs get-up-state.d (which reports for 10 seconds and then exits),
# keeps only the most recent line for each session, and prints the
# header followed by those lines sorted by PID.
#
# The raw dtrace output is left in $filename and the per-session
# summary in $final.

filename='/tmp/get-up-state.out'
final='/tmp/get-up-state.final'

# Gather our output first, dropping blank lines.
dtrace -s /opt/oxide/crucible_dtrace/get-up-state.d | awk 'NF' > "$filename"

# For each session keep only the latest line, then sort by PID.
# Matching on the SESSION column itself (rather than grepping the whole
# line) keeps an ID from matching digits in some other column.  The
# redirect always (re)creates $final, so a run that saw no sessions
# yields an empty summary instead of a missing-file error.
awk '$2 != "SESSION" { latest[$2] = $0 }
     END { for (id in latest) print latest[id] }' "$filename" |
    sort -n > "$final"

# Print the header
grep "SESSION" "$filename"
# Print the per-session lines, already sorted by PID.
cat "$final"

tools/dtrace/upstairs_count.d

+19-4
Original file line numberDiff line numberDiff line change
@@ -59,24 +59,37 @@ crucible_upstairs*:::gw-write-unwritten-done
5959
@write_unwritten_done = count();
6060
}
6161

62+
crucible_upstairs*:::gw-barrier-start
63+
/pid == $1/
64+
{
65+
/* Count gw-barrier-start firings from the process whose PID is $1. */
@barrier_start = count();
66+
}
67+
68+
crucible_upstairs*:::gw-barrier-done
69+
/pid == $1/
70+
{
71+
/* Count gw-barrier-done firings from the process whose PID is $1. */
@barrier_done = count();
72+
}
73+
6274
/*
6375
* Every second, check and see if we have printed enough that it is
6476
* time to print the header again
6577
*/
6678
tick-1s
6779
/show > 20/
6880
{
69-
printf("%4s %4s %4s %4s %5s %5s %4s %4s",
70-
"F>", "F<", "W>", "W<", "R>", "R<", "WU>", "WU<");
81+
printf("%4s %4s %4s %4s %5s %5s %4s %4s %4s %4s",
82+
"F>", "F<", "W>", "W<", "R>", "R<", "WU>", "WU<", "B>", "B<");
7183
printf("\n");
7284
show = 0;
7385
}
7486

7587
tick-1s
7688
{
77-
printa("%@4u %@4u %@4u %@4u %@5u %@5u %@4u %@4u",
89+
printa("%@4u %@4u %@4u %@4u %@5u %@5u %@4u %@4u %@4u %@4u",
7890
@flush_start, @flush_done, @write_start, @write_done,
79-
@read_start, @read_done, @write_unwritten_start, @write_unwritten_done
91+
@read_start, @read_done, @write_unwritten_start, @write_unwritten_done,
92+
@barrier_start, @barrier_done
8093
);
8194
printf("\n");
8295
clear(@flush_start);
@@ -87,5 +100,7 @@ tick-1s
87100
clear(@read_done);
88101
clear(@write_unwritten_start);
89102
clear(@write_unwritten_done);
103+
clear(@barrier_start);
104+
clear(@barrier_done);
90105
show = show + 1;
91106
}

tools/make-dtrace.sh

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ tar cvf ../../out/crucible-dtrace.tar \
2727
get-ds-state.sh \
2828
get-lr-state.d \
2929
get-lr-state.sh \
30+
get-up-state.d \
31+
get-up-state.sh \
3032
perf-downstairs-os.d \
3133
perf-downstairs-three.d \
3234
perf-downstairs-tick.d \

0 commit comments

Comments
 (0)