Allow read only activation with less than three downstairs (#1608)

leftwo · web-flow · commit ace54d9cb595 · 2025-01-24T15:38:57.000-08:00
Allow a read-only upstairs to activate with only a single downstairs present. In upstairs/src/upstairs.rs, I've added a check when a downstairs transitions to `WaitQuorum`. If we are read-only, then we can skip reconciliation and activate the upstairs. If we are already active (and read-only), then a new downstairs can go to active. Added some tests and a bit of additional test framework to verify an upstairs can activate with only a single downstairs ready. This "fixes" the feature request in #1599 and may help with #1593.
diff --git a/tools/test_read_only.sh b/tools/test_read_only.sh
@@ -0,0 +1,202 @@
+#!/bin/bash
+#
+# Tests of allowing a read only Upstairs with < 3 running downstairs
+# We do an initial RW fill, and record what data we expect
+# Then, stop and restart the downstairs as read only.
+# Loop over 1 missing downstairs, activate and verify our volume.
+# Loop over 2 missing downstairs, activate and verify our volume.
+set -o pipefail
+SECONDS=0
+
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+BINDIR=${BINDIR:-$ROOT/target/debug}
+
+# Run from the repository root. The failure handler is a brace group rather
+# than a subshell so that `exit 1` terminates the script itself.
+echo "$ROOT"
+cd "$ROOT" || { echo "failed to cd $ROOT" >&2; exit 1; }
+
+if pgrep -fl crucible-downstairs; then
+    echo 'Downstairs already running?' >&2
+    exit 1
+fi
+
+# Locate the binaries this test drives; all three must exist.
+cds="$BINDIR/crucible-downstairs"
+crutest="$BINDIR/crutest"
+dsc="$BINDIR/dsc"
+# Quote each path so a BINDIR containing whitespace is not word-split.
+for bin in "$cds" "$crutest" "$dsc"; do
+    if [[ ! -f "$bin" ]]; then
+        echo "Can't find crucible binary at $bin" >&2
+        exit 1
+    fi
+done
+
+# Use the id of the current user to make a unique path
+user=$(id -u -n)
+# Downstairs regions go in this directory
+testdir="/var/tmp/test_read_only-$user"
+if [[ -d "$testdir" ]]; then
+    rm -rf "$testdir"
+fi
+
+# Store log files we want to keep in /tmp/test_read_only-$user/*.txt as this
+# is what buildomat will look for and archive
+test_output_dir="/tmp/test_read_only-$user"
+rm -rf "$test_output_dir" 2> /dev/null
+mkdir -p "$test_output_dir"
+log_prefix="${test_output_dir}/test_read_only"
+fail_log="${log_prefix}_fail.txt"
+rm -f "$fail_log"
+verify_file="${log_prefix}_verify"
+rm -f "$verify_file"
+test_log="${log_prefix}_log"
+rm -f "$test_log"
+dsc_output_dir="${test_output_dir}/dsc"
+mkdir -p "$dsc_output_dir"
+dsc_output="${test_output_dir}/dsc-out.txt"
+echo "dsc output goes to $dsc_output"
+
+region_count=3
+args=()
+dsc_args=()
+dsc_create_args=()
+upstairs_key=$(openssl rand -base64 32)
+echo "Upstairs using key: $upstairs_key" | tee -a "$test_log"
+
+args+=( --key "$upstairs_key" )
+args+=( --dsc "127.0.0.1:9998" )
+dsc_create_args+=( --encrypted )
+dsc_create_args+=( --cleanup )
+dsc_args+=( --output-dir "$dsc_output_dir" )
+dsc_args+=( --ds-bin "$cds" )
+dsc_args+=( --region-dir "$testdir" )
+
+# Control-C handler: note the interruption in the test log, ask dsc to shut
+# down, then exit with a failure status.
+trap ctrl_c INT
+function ctrl_c() {
+    echo "Stopping at your request" | tee -a "$test_log"
+    "$dsc" cmd shutdown
+    exit 1
+}
+
+echo "Creating $region_count downstairs regions"
+echo "${dsc}" create "${dsc_create_args[@]}" --region-count "$region_count" --extent-size 10 --extent-count 5 "${dsc_args[@]}" > "$dsc_output"
+"${dsc}" create "${dsc_create_args[@]}" --region-count "$region_count" --extent-size 10 --extent-count 5 "${dsc_args[@]}" >> "$dsc_output" 2>&1
+
+echo "Starting $region_count downstairs"
+echo "${dsc}" start "${dsc_args[@]}" --region-count "$region_count" >> "$dsc_output"
+"${dsc}" start "${dsc_args[@]}" --region-count "$region_count" >> "$dsc_output" 2>&1 &
+dsc_pid=$!
+echo "dsc started at PID: $dsc_pid"
+
+while "$dsc" cmd all-running | grep false > /dev/null ; do
+    echo "Wait for all clients to be running."
+    sleep 3
+done
+
+echo ""
+echo "Begin tests, output goes to $test_log"
+gen=1
+
+# Initial fill and creation of verify file for future use.
+echo "$crutest" generic -g $gen "${args[@]}" --stable --verify-out "$verify_file" -c 250 | tee -a "$test_log"
+if ! "$crutest" generic -g $gen "${args[@]}" --stable --verify-out "$verify_file" -c 250 >> "$test_log"; then
+    echo "Failed initial fill"
+    exit 1
+fi
+(( gen += 1 ))
+
+echo "Shutdown dsc" | tee -a "$test_log"
+"$dsc" cmd shutdown
+wait $dsc_pid
+
+echo "Starting dsc in read only mode with $region_count downstairs" | tee -a "$test_log"
+echo "${dsc}" start "${dsc_args[@]}" --read-only --region-count $region_count >> "$dsc_output"
+"${dsc}" start "${dsc_args[@]}" --read-only --region-count $region_count >> "$dsc_output" 2>&1 &
+dsc_pid=$!
+echo "dsc started at PID: $dsc_pid" | tee -a "$test_log"
+
+loop=0
+while [[ "$loop" -lt 10 ]]; do
+    echo "$(date) Begin loop $loop" | tee -a "$test_log"
+    # Begin with all downstairs running
+    "$dsc" cmd start-all | tee -a "$test_log"
+
+    # In this first loop, we turn off one downstairs at a time and then verify
+    # our volume with the other two downstairs running.
+    for cid in {0..2}; do
+
+        while "$dsc" cmd all-running | grep false > /dev/null ; do
+            echo "Wait for all clients to be running." | tee -a "$test_log"
+            sleep 3
+        done
+
+        echo "Stop downstairs $cid" | tee -a "$test_log"
+        "$dsc" cmd stop -c "$cid" | tee -a "$test_log"
+
+        while ! "$dsc" cmd state -c "$cid" | grep Exit > /dev/null; do
+            echo "Waiting for client $cid to stop" | tee -a "$test_log"
+            sleep 3
+        done
+
+        echo "Run verify test with downstairs $cid stopped" | tee -a "$test_log"
+        echo "$crutest" verify -g "$gen" "${args[@]}" -q --verify-in "$verify_file" --read-only | tee -a "$test_log"
+        if ! "$crutest" verify -g "$gen" "${args[@]}" -q --verify-in "$verify_file" --read-only >> "$test_log" 2>&1 ; then
+            echo "Failed first test at loop $loop, cid: $cid"
+            exit 1
+        fi
+
+        echo "Start downstairs $cid" | tee -a "$test_log"
+        "$dsc" cmd start -c "$cid" | tee -a "$test_log"
+
+    done
+
+    while "$dsc" cmd all-running | grep false > /dev/null ; do
+        echo "Wait for all clients to be running." | tee -a "$test_log"
+        sleep 3
+    done
+
+    # Begin the next loop with all downstairs stopped
+    echo "Stopping all downstairs" | tee -a "$test_log"
+    "$dsc" cmd stop-all
+
+    # Second loop. Here we start just one downstairs (leaving all the
+    # other downstairs stopped)
+    for cid in {0..2}; do
+
+        echo "Start just downstairs $cid" | tee -a "$test_log"
+        "$dsc" cmd start -c "$cid"
+
+        # Confirm that every other downstairs has stopped, leaving only ours.
+        for stopped_cid in {0..2}; do
+            if [[ "$stopped_cid" -eq "$cid" ]]; then
+                continue
+            fi
+            while ! "$dsc" cmd state -c "$stopped_cid" | grep Exit > /dev/null; do
+                echo "Waiting for client $stopped_cid to stop" | tee -a "$test_log"
+                sleep 3
+            done
+        done
+
+        echo "Run read only test with only downstairs $cid running" | tee -a "$test_log"
+        echo "$crutest" verify -g "$gen" "${args[@]}" -q --verify-in "$verify_file" --read-only >> "$test_log"
+        if ! "$crutest" verify -g "$gen" "${args[@]}" -q --verify-in "$verify_file" --read-only >> "$test_log" 2>&1 ; then
+            echo "Failed second test at loop $loop, cid: $cid"
+            exit 1
+        fi
+
+        # stop downstairs $cid
+        echo "Stop downstairs $cid" | tee -a "$test_log"
+        "$dsc" cmd stop -c "$cid"
+
+    done
+    echo "$(date) End of loop $loop" | tee -a "$test_log"
+    ((loop += 1))
+done
+
+## loop done
+echo "Shutdown dsc" | tee -a "$test_log"
+"$dsc" cmd shutdown
+wait $dsc_pid
+
+duration=$SECONDS
+printf "Test with %d loops took: %d:%02d\n" \
+    "$loop" $((duration / 60)) $((duration % 60)) | tee -a "$test_log"
diff --git a/upstairs/src/client.rs b/upstairs/src/client.rs
@@ -857,6 +857,14 @@ impl DownstairsClient {
     ) -> EnqueueResult {
         match self.state {
             // We never send jobs if we're in certain inactive states
+            DsState::Connecting {
+                mode: ConnectionMode::New,
+                ..
+            } if self.cfg.read_only => {
+                // Read only upstairs can connect with just a single downstairs
+                // ready, we skip jobs on the other downstairs till they connect.
+                EnqueueResult::Skip
+            }
             DsState::Connecting {
                 mode: ConnectionMode::Faulted | ConnectionMode::Replaced,
                 ..
@@ -897,7 +905,7 @@ impl DownstairsClient {
 
             DsState::Stopping(ClientStopReason::Deactivated)
             | DsState::Connecting {
-                mode: ConnectionMode::New,
+                mode: ConnectionMode::New, // RO client checked above
                 ..
             } => panic!(
                 "enqueue should not be called from state {:?}",
diff --git a/upstairs/src/downstairs.rs b/upstairs/src/downstairs.rs
@@ -1918,6 +1918,33 @@ impl Downstairs {
         self.reconcile_repair_aborted += 1;
     }
 
+    /// Activates clients directly when initial reconciliation is skipped
+    ///
+    /// Every client currently in `Connecting { state: WaitQuorum, .. }` is
+    /// marked active via `set_active()`.  Clients in any other state are left
+    /// alone, and a warning naming the client and its state is logged.
+    ///
+    /// # Panics
+    /// If a reconciliation is present (`self.reconcile` is `Some`), or if
+    /// `go_active` is set while `self.ds_active` is not empty.
+    pub(crate) fn on_reconciliation_skipped(&mut self, go_active: bool) {
+        assert!(self.reconcile.is_none());
+        if go_active {
+            assert!(self.ds_active.is_empty());
+        }
+
+        for (cid, client) in self.clients.iter_mut().enumerate() {
+            let waiting_for_quorum = matches!(
+                client.state(),
+                DsState::Connecting {
+                    state: NegotiationState::WaitQuorum,
+                    ..
+                }
+            );
+            if !waiting_for_quorum {
+                warn!(
+                    self.log,
+                    "client {} is in state {:?} not ready for activation",
+                    cid,
+                    client.state(),
+                );
+                continue;
+            }
+            client.set_active();
+        }
+    }
+
     /// Asserts that initial reconciliation is done, and sets clients as Active
     ///
     /// # Panics
diff --git a/upstairs/src/dummy_downstairs_tests.rs b/upstairs/src/dummy_downstairs_tests.rs
@@ -2871,6 +2871,111 @@ async fn test_no_send_offline() {
     }
 }
 
+/// Checks that a read-only upstairs activates and serves a read when only
+/// the downstairs selected by `activate` complete negotiation.
+///
+/// Three read-only dummy downstairs are started; entry `i` of `activate`
+/// decides whether downstairs `i` is driven through negotiation and, later,
+/// whether it acks the test read.
+async fn test_ro_activate_from_list(activate: [bool; 3]) {
+    let log = csl();
+
+    let extents = DEFAULT_EXTENT_COUNT as usize;
+    let cfg = DownstairsConfig {
+        read_only: true,
+        reply_to_ping: true,
+        extent_count: DEFAULT_EXTENT_COUNT,
+        extent_size: Block::new_512(DEFAULT_BLOCK_COUNT),
+        gen_numbers: vec![0u64; extents],
+        flush_numbers: vec![0u64; extents],
+        dirty_bits: vec![false; extents],
+    };
+
+    let mut ds1 = cfg.clone().start(log.new(o!("downstairs" => 1))).await;
+    let mut ds2 = cfg.clone().start(log.new(o!("downstairs" => 2))).await;
+    let mut ds3 = cfg.clone().start(log.new(o!("downstairs" => 3))).await;
+
+    let (g, io) = Guest::new(Some(log.clone()));
+    let guest = Arc::new(g);
+
+    let targets = vec![ds1.local_addr, ds2.local_addr, ds3.local_addr];
+    let crucible_opts = CrucibleOpts {
+        id: Uuid::new_v4(),
+        target: targets,
+        flush_timeout: Some(1.0),
+        read_only: true,
+
+        ..Default::default()
+    };
+
+    let join_handle = up_main(crucible_opts, 1, None, io, None).unwrap();
+
+    // Request activation in the background while negotiation is driven
+    // from this task.
+    let _activation: JoinHandle<()> = {
+        let guest = guest.clone();
+        tokio::spawn(async move {
+            guest.activate().await.unwrap();
+        })
+    };
+
+    // Move negotiation along only for the downstairs we want to activate.
+    for (i, ds) in [&mut ds1, &mut ds2, &mut ds3].iter_mut().enumerate() {
+        if !activate[i] {
+            continue;
+        }
+        info!(log, "Activate downstairs {i}");
+        ds.negotiate_start().await;
+        ds.negotiate_step_extent_versions_please().await;
+    }
+
+    // Poll for activation: at most ten checks, one second apart.
+    let mut checks = 0;
+    while checks < 10 && !guest.query_is_active().await.unwrap() {
+        tokio::time::sleep(Duration::from_secs(1)).await;
+        checks += 1;
+    }
+
+    assert!(guest.query_is_active().await.unwrap());
+
+    // Hand everything to a test harness so we can send IO.
+    let mut harness = TestHarness {
+        log: log.clone(),
+        ds1: Some(ds1),
+        ds2,
+        ds3,
+        _join_handle: join_handle,
+        guest,
+    };
+
+    // The read must run in its own task, because `read` does not return
+    // until the response has come back.
+    let read_task = harness.spawn(|guest| async move {
+        let mut buf = Buffer::new(1, 512);
+        guest.read(BlockIndex(0), &mut buf).await.unwrap();
+    });
+
+    // Only the downstairs that were activated ack the read.
+    if activate[0] {
+        harness.ds1().ack_read().await;
+    }
+    if activate[1] {
+        harness.ds2.ack_read().await;
+    }
+    if activate[2] {
+        harness.ds3.ack_read().await;
+    }
+
+    read_task.await.unwrap(); // after > 1x response, the read finishes
+}
+
+/// A read-only upstairs must activate when exactly one downstairs is ready,
+/// whichever of the three it is.
+#[tokio::test]
+async fn test_ro_activate_with_one() {
+    let patterns = [
+        [true, false, false],
+        [false, true, false],
+        [false, false, true],
+    ];
+    for activate in patterns {
+        test_ro_activate_from_list(activate).await;
+    }
+}
+
+/// A read-only upstairs must activate when exactly two downstairs are ready,
+/// whichever one of the three is missing.
+#[tokio::test]
+async fn test_ro_activate_with_two() {
+    let patterns = [
+        [true, true, false],
+        [true, false, true],
+        [false, true, true],
+    ];
+    for activate in patterns {
+        test_ro_activate_from_list(activate).await;
+    }
+}
+
 /// Test that barrier operations are sent periodically
 #[tokio::test]
 async fn test_jobs_based_barrier() {
diff --git a/upstairs/src/upstairs.rs b/upstairs/src/upstairs.rs