oxidecomputer
diff --git a/‎Cargo.toml
+3 b/‎Cargo.toml
+3
diff --git a/‎tools/test_fail_live_repair.sh
+58-14 b/‎tools/test_fail_live_repair.sh
+58-14
diff --git a/‎upstairs/src/active_jobs.rs
+5-90 b/‎upstairs/src/active_jobs.rs
+5-90
diff --git a/‎upstairs/src/block_req.rs
+5 b/‎upstairs/src/block_req.rs
+5
@@ -130,6 +130,9 @@ repair-client = { path = "./repair-client" }
 [profile.dev]
 panic = 'abort'
 
+[profile.release]
+panic = 'abort'
+
 # Using the workspace-hack via this patch directive means that it only applies
 # while building within this workspace. If another workspace imports a crate
 # from here via a git dependency, it will not have the workspace-hack applied
 
@@ -27,6 +27,7 @@ function ctrl_c() {
         kill "$crutest_pid"
         wait "$crutest_pid"
     fi
+    exit 1
 }
 
 loop_log=/tmp/test_fail_live_repair_summary.log
@@ -53,6 +54,24 @@ if pgrep -fl -U "$(id -u)" "$cds"; then
     exit 1
 fi
 
+loops=20
+
+usage () {
+    echo "Usage: $0 [-l #]" >&2
+    echo " -l loops   Number of test loops to perform (default 20)" >&2
+}
+
+while getopts 'l:' opt; do
+    case "$opt" in
+        l)  loops=$OPTARG
+            ;;
+        *)  echo "Invalid option"
+            usage
+            exit 1
+            ;;
+    esac
+done
+
 echo "" > ${loop_log}
 echo "" > ${test_log}
 echo "" > ${dsc_test_log}
@@ -93,17 +112,18 @@ if ! "$crucible_test" fill "${args[@]}" -q -g "$gen"\
           --retry-activate >> "$test_log" 2>&1 ; then
     echo Failed on initial verify seed, check "$test_log"
     ${dsc} cmd shutdown
+    exit 1
 fi
 (( gen += 1 ))
 
-for i in {1..20}
-do
+count=1
+while [[ $count -le $loops ]]; do
     SECONDS=0
     choice=$((RANDOM % 3))
 
-    # The state of our chosen downstairs is based on an offset
+    # Clear the log on each loop
     echo "" > "$test_log"
-    echo "New loop starts now $(date) faulting: $choice" >> "$test_log"
+    echo "New loop starts now $(date) faulting: $choice" | tee -a "$test_log"
     # Start sending IO.
     "$crucible_test" generic "${args[@]}" --continuous \
         -q -g "$gen" --verify-out "$verify_file" \
@@ -113,9 +133,25 @@ do
     crutest_pid=$!
     sleep 5
 
-    curl -X POST http://127.0.0.1:7777/downstairs/fault/"${choice}"
+    ${dsc} cmd stop -c "$choice" >> "$dsc_test_log" 2>&1 &
+    # Wait for our downstairs to fault
+    echo Wait for our downstairs to fault | tee -a "$test_log"
+    choice_state="undefined"
+    while [[ "$choice_state" != "faulted" ]]; do
+        sleep 3
+        if [[ $choice -eq 0 ]]; then
+            choice_state=$(curl -s http://127.0.0.1:7777/info | awk -F\" '{print $8}')
+        elif [[ $choice -eq 1 ]]; then
+            choice_state=$(curl -s http://127.0.0.1:7777/info | awk -F\" '{print $10}')
+        else
+            choice_state=$(curl -s http://127.0.0.1:7777/info | awk -F\" '{print $12}')
+        fi
+    done
+
+    ${dsc} cmd start -c "$choice" >> "$dsc_test_log" 2>&1 &
 
     # Wait for our downstairs to begin live_repair
+    echo Wait for our downstairs to begin live_repair | tee -a "$test_log"
     choice_state="undefined"
     while [[ "$choice_state" != "live_repair" ]]; do
         sleep 3
@@ -128,19 +164,26 @@ do
         fi
     done
 
-    # Let the repair do some repairing.
-    sleep 5
+    # Give the live repair between 5 and 24 seconds to start repairing.
+    rand_sleep=$((RANDOM % 20))
+    ((rand_sleep += 5))
+    sleep $rand_sleep
+    echo "After $rand_sleep seconds, Stop $choice again" | tee -a "$test_log"
+    ${dsc} cmd stop -c "$choice" >> "$dsc_test_log" 2>&1 &
 
-    # Now fault the downstairs again.
-    curl -X POST http://127.0.0.1:7777/downstairs/fault/"${choice}"
+    sleep 2
+    echo "Start $choice for a second time" | tee -a "$test_log"
+    ${dsc} cmd start -c "$choice" >> "$dsc_test_log" 2>&1 &
 
     # Now wait for all downstairs to be active
+    echo Now wait for all downstairs to be active | tee -a "$test_log"
     all_state=$(curl -s http://127.0.0.1:7777/info | awk -F\" '{print $8","$10","$12}')
     while [[ "${all_state}" != "active,active,active" ]]; do
         sleep 5
         all_state=$(curl -s http://127.0.0.1:7777/info | awk -F\" '{print $8","$10","$12}')
     done
 
+    echo All downstairs active, now stop IO test and wait for it to finish | tee -a "$test_log"
     kill -SIGUSR1 $crutest_pid
     wait $crutest_pid
     result=$?
@@ -150,7 +193,7 @@ do
         else
             (( err += 1 ))
             duration=$SECONDS
-            printf "[%03d] Error $result after %d:%02d\n" "$i" \
+            printf "[%03d] Error $result after %d:%02d\n" "$count" \
                     $((duration / 60)) $((duration % 60)) | tee -a ${loop_log}
             mv "$test_log" "$test_log".lastfail
             break
@@ -166,7 +209,7 @@ do
             (( dropshot += 1 ))
         else
             mv "$test_log" "$test_log".lastfail
-            echo "verify failed on loop $i"
+            echo "verify failed on loop $count"
             (( err += 1 ))
             break
         fi
@@ -179,12 +222,13 @@ do
     ave=$(( total / pass_total ))
     printf \
       "[%03d][%d] %d:%02d  ds_err:%d ave:%d:%02d total:%d:%02d last_run:%d\n" \
-      "$i" "$choice" \
+      "$count" "$choice" \
       $((duration / 60)) $((duration % 60)) \
       "$dropshot"  \
       $((ave / 60)) $((ave % 60))  $((total / 60)) $((total % 60)) \
       "$duration" | tee -a ${loop_log}
 
+    (( count += 1 ))
 done
 
 # Stop dsc.
@@ -194,9 +238,9 @@ wait ${dsc_pid}
 echo "Final results:" | tee -a ${loop_log}
 printf \
   "[%03d] %d:%02d  ave:%d:%02d  total:%d:%02d errors:%d last_run_seconds:%d\n" \
-  "$i" $((duration / 60)) $((duration % 60)) \
+  "$count" $((duration / 60)) $((duration % 60)) \
   $((ave / 60)) $((ave % 60)) $((total / 60)) $((total % 60)) \
   "$err" $duration | tee -a ${loop_log}
-echo "$(date) Test ends with $err" >> "$test_log"
+echo "$(date) Test ends with $err" | tee -a "$test_log"
 exit "$err"
 
@@ -3,23 +3,15 @@
 use crucible_protocol::JobId;
 
 use crate::{
-    AckStatus, DownstairsIO, ExtentRepairIDs, IOop, ImpactedAddr,
-    ImpactedBlocks,
+    DownstairsIO, ExtentRepairIDs, IOop, ImpactedAddr, ImpactedBlocks,
 };
 use std::collections::{BTreeMap, BTreeSet};
 
-/// `ActiveJobs` tracks active jobs by ID
+/// `ActiveJobs` tracks active jobs (and associated metadata) by job ID
 ///
 /// It exposes an API that roughly matches a `BTreeMap<JobId, DownstairsIO>`,
 /// but leaves open the possibility for further optimization.
 ///
-/// Notably, there is no way to directly modify a `DownstairsIO` contained in
-/// `ActiveJobs`.  Bulk modification can be done with `for_each`, and individual
-/// modification can be done with `get_mut`, which returns a
-/// `DownstairsIOHandle` instead of a raw `&mut DownstairsIO`.  All of this
-/// means that we can keep extra metadata in sync, e.g. a list of all ackable
-/// jobs.
-///
 /// The `ActiveJobs` structure also includes a data structure ([`BlockMap`])
 /// which accelerates dependency tracking: it tracks the most recent blocking
 /// (write / flush / etc) and non-blocking (read) jobs on a per-block basis,
@@ -29,7 +21,6 @@ use std::collections::{BTreeMap, BTreeSet};
 #[derive(Debug, Default)]
 pub(crate) struct ActiveJobs {
     jobs: BTreeMap<JobId, DownstairsIO>,
-    ackable: BTreeSet<JobId>,
     block_to_active: BlockMap,
 }
 
@@ -46,10 +37,8 @@ impl ActiveJobs {
 
     /// Looks up a job by ID, returning a mutable reference
     #[inline]
-    pub fn get_mut(&mut self, job_id: &JobId) -> Option<DownstairsIOHandle> {
-        self.jobs
-            .get_mut(job_id)
-            .map(|job| DownstairsIOHandle::new(job, &mut self.ackable))
+    pub fn get_mut(&mut self, job_id: &JobId) -> Option<&mut DownstairsIO> {
+        self.jobs.get_mut(job_id)
     }
 
     /// Returns the total number of active jobs
@@ -68,8 +57,7 @@ impl ActiveJobs {
     #[inline]
     pub fn for_each<F: FnMut(&JobId, &mut DownstairsIO)>(&mut self, mut f: F) {
         for (job_id, job) in self.jobs.iter_mut() {
-            let handle = DownstairsIOHandle::new(job, &mut self.ackable);
-            f(job_id, handle.job);
+            f(job_id, job);
         }
     }
 
@@ -209,10 +197,6 @@ impl ActiveJobs {
         dep
     }
 
-    pub fn ackable_work(&self) -> BTreeSet<JobId> {
-        self.ackable.clone()
-    }
-
     #[cfg(test)]
     pub fn get_extents_for(&self, job: JobId) -> ImpactedBlocks {
         *self.block_to_active.job_to_range.get(&job).unwrap()
@@ -230,75 +214,6 @@ impl<'a> IntoIterator for &'a ActiveJobs {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-/// Handle for a `DownstairsIO` that keeps secondary data in sync
-///
-/// Many parts of the code want to modify a `DownstairsIO` by directly poking
-/// its fields.  This makes it hard to keep secondary data in sync, e.g.
-/// maintaining a separate list of all ackable IOs.
-pub(crate) struct DownstairsIOHandle<'a> {
-    pub job: &'a mut DownstairsIO,
-    initial_status: AckStatus,
-    ackable: &'a mut BTreeSet<JobId>,
-}
-
-impl<'a> std::fmt::Debug for DownstairsIOHandle<'a> {
-    fn fmt(
-        &self,
-        f: &mut std::fmt::Formatter<'_>,
-    ) -> Result<(), std::fmt::Error> {
-        self.job.fmt(f)
-    }
-}
-
-impl<'a> DownstairsIOHandle<'a> {
-    fn new(
-        job: &'a mut DownstairsIO,
-        ackable: &'a mut BTreeSet<JobId>,
-    ) -> Self {
-        let initial_status = job.ack_status;
-        Self {
-            job,
-            initial_status,
-            ackable,
-        }
-    }
-
-    pub fn job(&mut self) -> &mut DownstairsIO {
-        self.job
-    }
-}
-
-impl<'a> std::ops::Drop for DownstairsIOHandle<'a> {
-    fn drop(&mut self) {
-        match (self.initial_status, self.job.ack_status) {
-            (AckStatus::NotAcked, AckStatus::AckReady) => {
-                let prev = self.ackable.insert(self.job.ds_id);
-                assert!(prev);
-            }
-            (AckStatus::AckReady, AckStatus::Acked | AckStatus::NotAcked) => {
-                let prev = self.ackable.remove(&self.job.ds_id);
-                assert!(prev);
-            }
-            // None transitions
-            (AckStatus::AckReady, AckStatus::AckReady)
-            | (AckStatus::Acked, AckStatus::Acked)
-            | (AckStatus::NotAcked, AckStatus::NotAcked) => (),
-
-            // Invalid transitions!
-            (AckStatus::NotAcked, AckStatus::Acked)
-            | (AckStatus::Acked, AckStatus::NotAcked)
-            | (AckStatus::Acked, AckStatus::AckReady) => {
-                panic!(
-                    "invalid transition: {:?} => {:?}",
-                    self.initial_status, self.job.ack_status
-                )
-            }
-        }
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 /// Acceleration data structure to quickly look up dependencies
 #[derive(Debug, Default)]
 struct BlockMap {
 
@@ -42,6 +42,11 @@ impl BlockReq {
         // XXX this eats the result!
         let _ = self.sender.send(r);
     }
+
+    /// Consume this BlockReq and return the inner oneshot sender
+    pub fn take_sender(self) -> oneshot::Sender<Result<(), CrucibleError>> {
+        self.sender
+    }
 }
 
 /**
Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,11 @@ impl BlockReq {`
`42`	`42`	`// XXX this eats the result!`
`43`	`43`	`let _ = self.sender.send(r);`
`44`	`44`	`}`
	`45`	`+`
	`46`	`+ /// Consume this BlockReq and return the inner oneshot sender`
	`47`	`+ pub fn take_sender(self) -> oneshot::Sender<Result<(), CrucibleError>> {`
	`48`	`+ self.sender`
	`49`	`+ }`
`45`	`50`	`}`
`46`	`51`
`47`	`52`	`/**`