Skip to content

not ready for consumption yet #2

not ready for consumption yet

not ready for consumption yet #2

Workflow file for this run

# Workflow-level settings shared by every job below.
# NOTE: the `on:` trigger block is commented out, so this workflow has no
# trigger until it is restored.
name: Sunbeam Multi-node HA
# on:
#   push:
#     branches: ["act"]
#   workflow_dispatch:
#     inputs:
#       use_workaround:
#         description: 'Apply a workaround'
#         required: true
#         default: true
#         type: boolean
#       hardware_profile:
#         description: 'Specs for each machine'
#         required: true
#         default: tutorial
#         type: choice
#         options:
#           - minimal
#           - minimal-with-cpu-overcommit
#           - tutorial
#           - allowance
permissions:
  contents: read
env:
  COLUMNS: 160  # default: 80
  DEBIAN_FRONTEND: noninteractive
  # github.event.inputs.use_workaround is always a string:
  #   - a non-empty 'true' or 'false' when coming from workflow_dispatch
  #   - '' when coming from the on-push event
  # so `|| true` supplies the default only when no input was given.
  USE_WORKAROUND: ${{ github.event.inputs.use_workaround || true }}
  HARDWARE_PROFILE: ${{ inputs.hardware_profile || 'tutorial' }}
defaults:
  run:
    # Every step is wrapped in /usr/bin/time because act does not give a
    # good per-step elapsed-time summary. The otherwise unnecessary
    # leading `bash -e -c` is there so that actionlint activates
    # shellcheck for the step scripts.
    shell: bash -e -c '/usr/bin/time -f "\nStep total time:\t%E" bash -ex {0}'
jobs:
# Lints this workflow file before the deployment job is allowed to run.
actionlint:
  runs-on: [self-hosted, linux, AMD64, X64, medium, noble]
  steps:
    - uses: actions/checkout@v4
    # The two setup steps below are skipped when env.ACT is set; the check
    # step then falls back to an `actionlint` binary found on PATH.
    - name: Download actionlint
      if: ${{ !env.ACT }}
      id: get_actionlint
      run: bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)
    - name: Install prerequisites for actionlint
      if: ${{ !env.ACT }}
      run: |
        sudo apt-get update
        sudo apt-get install -y shellcheck
    - name: Check workflow files
      run: |
        # use the downloaded executable when the download step ran,
        # otherwise a local binary (env.ACT case)
        ${{ steps.get_actionlint.outputs.executable || 'actionlint' }} -color \
          .github/workflows/multi-node-ha.yml
# Main job: builds three VMs on the runner and deploys Sunbeam across them.
multi-node-ha:
  name: Multi-node HA
  runs-on: [self-hosted, linux, AMD64, X64, large, noble]
  # only start once the workflow lint job has succeeded
  needs: actionlint
  steps:
    - uses: actions/checkout@v4
- name: Set env
  run: |
    # Flag the run up front when workarounds are enabled.
    if [ "$USE_WORKAROUND" = true ]; then
      echo '::warning::Not a clean run. Some workarounds are going to be used.'
    fi
    # Publish the per-VM sizing of the selected profile to later steps
    # through $GITHUB_ENV; unknown profiles abort the step.
    case "$HARDWARE_PROFILE" in
      minimal)
        {
          echo CPU=4
          echo MEMORY=20
          echo DISK=128
          echo EXTRA_DISK=128
        } >> "$GITHUB_ENV"
        ;;
      minimal-with-cpu-overcommit)
        {
          echo CPU=16
          echo MEMORY=20
          echo DISK=128
          echo EXTRA_DISK=128
        } >> "$GITHUB_ENV"
        ;;
      tutorial)
        # https://canonical.com/microstack/docs/multi-node
        {
          echo CPU=4
          echo MEMORY=32
          echo DISK=250
          echo EXTRA_DISK=200
        } >> "$GITHUB_ENV"
        ;;
      allowance)
        {
          echo CPU=16
          echo MEMORY=64
          echo DISK=512
          echo EXTRA_DISK=512
        } >> "$GITHUB_ENV"
        ;;
      *)
        echo '::error:: Invalid hardware profile'
        exit 1
        ;;
    esac
    # FIXME: use $GITHUB_STEP_SUMMARY
    # Read the values back into this shell so the notice can print them.
    # shellcheck source=/dev/null
    source "$GITHUB_ENV"
    echo "::notice::Selected hardware profile: ${HARDWARE_PROFILE}.%0ACPU: ${CPU}%0AMEMORY: ${MEMORY}%0ADISK: ${DISK}%0AEXTRA_DISK: ${EXTRA_DISK}"
- name: Check machine specs
  run: |
    # Print a summary of the runner host for the job log.
    # bare metal returns "none" with exit 1
    systemd-detect-virt || true
    cat /etc/os-release
    lscpu
    free -h
    lsblk -e7
    lsblk -e7 -f
    # Extra detail only for debug runs. GitHub runners signal debug
    # logging to steps via RUNNER_DEBUG=1; ACTIONS_STEP_DEBUG is only
    # visible here when it is exported explicitly, so accept either.
    if [ "$ACTIONS_STEP_DEBUG" = true ] || [ "$RUNNER_DEBUG" = 1 ]; then
      lsblk
      lsblk -f
      # IPv6 address can be sensitive
      ip -br a
      ip r
      resolvectl --no-pager
    fi
# TODO: check if greenfield or brownfield first
- name: Install prerequisites
  if: ${{ !env.ACT }}
  run: |
    # host-side tooling: uvtool for the VMs, j2cli for manifest rendering
    sudo apt-get update
    sudo apt-get install -y uvtool j2cli
    # The "runner" user of a GitHub workflow is not in the sudo group,
    # so it is not put into the libvirt group automatically; add the
    # default user explicitly.
    sudo adduser "$USER" libvirt
- name: Download a VM image
  if: ${{ !env.ACT }}
  run: |
    # sync the noble/amd64 image, then list what is available locally;
    # `sudo -g libvirt` runs the commands with the libvirt group
    sudo -g libvirt uvt-simplestreams-libvirt sync release=noble arch=amd64
    sudo -g libvirt uvt-simplestreams-libvirt query
- name: Prepare SSH, virtual network bridge
  if: ${{ !env.ACT }}
  run: |
    # SSH: passphrase-less key pair plus the client config from the repo
    ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -N ''
    tee -a ~/.ssh/config < .github/assets/workflows/ssh_config
    # bridge: define the libvirt network, mark it autostart, bring it up
    virsh_system() { sudo -g libvirt virsh -c qemu:///system "$@"; }
    virsh_system net-define .github/assets/workflows/sunbeam-virbr0.xml
    virsh_system net-autostart sunbeam-virbr0
    virsh_system net-start sunbeam-virbr0
- name: Clean up previous virtual machines
  if: ${{ env.ACT }}
  run: |
    # Best-effort removal of VMs and known-hosts entries left over from
    # an earlier run; failures are ignored on purpose.
    for i in {1..3}; do
      # FIXME: the requirement of FQDN is not documented well in each tutorial
      sudo -g libvirt uvt-kvm destroy "sunbeam-multi-node-ha-${i}.localdomain" || true
      ssh-keygen -R "192.168.124.4${i}" || true
    done
- name: Prepare virtual machines
  run: |
    virsh_system() { sudo -g libvirt virsh -c qemu:///system "$@"; }

    # Create (but do not start) the three VMs. Each gets a static
    # address 192.168.124.4<i> through the netplan config on stdin.
    # MEMORY is multiplied by 1024 before being handed to --memory.
    for i in {1..3}; do
      sudo -g libvirt uvt-kvm create \
        --machine-type q35 \
        --cpu "$CPU" \
        --host-passthrough \
        --memory "$((MEMORY * 1024))" \
        --disk "$DISK" \
        --ephemeral-disk "$EXTRA_DISK" \
        --ephemeral-disk "$EXTRA_DISK" \
        --unsafe-caching \
        --bridge sunbeam-virbr0 \
        --network-config /dev/stdin \
        --ssh-public-key-file ~/.ssh/id_ed25519.pub \
        --no-start \
        "sunbeam-multi-node-ha-${i}.localdomain" \
        release=noble <<EOF
    network:
      version: 2
      ethernets:
        enp1s0:
          dhcp4: false
          dhcp6: false
          accept-ra: false
          addresses:
            - 192.168.124.4${i}/24
          routes:
            - to: default
              via: 192.168.124.1
          nameservers:
            addresses:
              - 192.168.124.1
    EOF
    done

    # secondary NIC
    for i in {1..3}; do
      virsh_system attach-interface "sunbeam-multi-node-ha-${i}.localdomain" \
        network sunbeam-virbr0 \
        --model virtio --config
    done

    # LP: #2095570
    # Attach one more qcow2 volume per VM on a SATA bus as sda.
    if [ "$USE_WORKAROUND" = true ]; then
      echo '::warning::Workaround for https://launchpad.net/bugs/2095570'
      for i in {1..3}; do
        virsh_system vol-create-as uvtool --format qcow2 \
          "sunbeam-multi-node-ha-${i}-sata1.qcow" "$((EXTRA_DISK * 1024**3))"
        virsh_system attach-disk "sunbeam-multi-node-ha-${i}.localdomain" \
          "/var/lib/uvtool/libvirt/images/sunbeam-multi-node-ha-${i}-sata1.qcow" \
          sda --subdriver qcow2 --targetbus sata --config
      done
    fi

    # Boot everything, then poll over SSH until cloud-init has finished
    # on each VM.
    for i in {1..3}; do
      virsh_system start "sunbeam-multi-node-ha-${i}.localdomain"
    done
    for i in {1..3}; do
      until ssh -oStrictHostKeyChecking=no "sunbeam-multi-node-ha-${i}" -- 'cloud-init status --wait; ip -br a; lsblk'; do
        sleep 5
      done
    done

    # LP: #2065911
    # Install a netplan drop-in for enp9s0 on every VM and apply it.
    if [ "$USE_WORKAROUND" = true ]; then
      echo '::warning::Workaround for https://launchpad.net/bugs/2065911'
      for i in {1..3}; do
        ssh "sunbeam-multi-node-ha-${i}" -- 'sudo install -m 0600 /dev/stdin /etc/netplan/90-local-ovs-ext-port.yaml <<EOF
    network:
      version: 2
      ethernets:
        # LP: #2065911
        enp9s0:
          dhcp4: false
          dhcp6: false
          accept-ra: false
    EOF
    sudo netplan apply
    '
      done
    fi
- name: Sunbeam - Prepare manifest file
  run: |
    if [ "$USE_WORKAROUND" = true ]; then
      echo '::warning::Workaround for https://launchpad.net/bugs/2098163'
      echo '::warning::Workaround for https://launchpad.net/bugs/2098438'
    fi
    # Render the manifest template with the variables supplied as YAML
    # on stdin, then copy the result to the first machine.
    # NOTE(review): DOCKERHUB_MIRROR is not set anywhere in this
    # workflow; presumably it comes from the runner environment - confirm.
    j2 -f yaml -o ./manifest.yaml .github/assets/workflows/multi-node/manifest.yaml.j2 - <<EOF
    use_workaround: $USE_WORKAROUND
    dockerhub_mirror: $DOCKERHUB_MIRROR
    EOF
    scp ./manifest.yaml sunbeam-multi-node-ha-1:
- name: Sunbeam - Prepare the first machine
  run: |
    # install the openstack snap, then run the generated bootstrap
    # preparation script on the node
    ssh sunbeam-multi-node-ha-1 -- sudo snap install openstack --channel 2024.1/edge
    ssh sunbeam-multi-node-ha-1 -- 'sunbeam prepare-node-script --bootstrap | bash -x'
- name: Sunbeam - Bootstrap the cloud
  run: |
    # Why a single -t:
    # - it is necessary to see some progress in act env, LP:#2097451
    # - without it, the add-k8s command somehow gets stuck in act env,
    #   although that doesn't happen in GitHub runner
    # - without -tt, GitHub runner's log should be quiet
    ssh sunbeam-multi-node-ha-1 -t -- sunbeam cluster bootstrap --manifest manifest.yaml \
      --role control,compute,storage
- name: Workaround - destroy localhost controller
  # USE_WORKAROUND is the string 'true' or 'false'; a bare
  # `${{ env.USE_WORKAROUND }}` is truthy for both (any non-empty
  # string), so compare explicitly.
  if: ${{ env.USE_WORKAROUND == 'true' }}
  run: |
    # LP: #2095487, without doing it, a random /24 range will be
    # dead from a user perspective and the OpenStack API cannot
    # return a response to the range since it will go to the
    # unused network bridge.
    echo '::warning::Workaround for https://launchpad.net/bugs/2095487'
    ssh sunbeam-multi-node-ha-1 -- juju destroy-controller localhost-localhost --no-prompt
    ssh sunbeam-multi-node-ha-1 -- lxc profile device remove default eth0
    ssh sunbeam-multi-node-ha-1 -- lxc network delete sunbeambr0
- name: Workaround - enable debug logging
  # USE_WORKAROUND is the string 'true' or 'false'; a bare
  # `${{ env.USE_WORKAROUND }}` is truthy for both (any non-empty
  # string), so compare explicitly.
  if: ${{ env.USE_WORKAROUND == 'true' }}
  run: |
    # LP: #2065490
    # raise unit logging to DEBUG on the admin/openstack-machines model
    ssh sunbeam-multi-node-ha-1 -- 'juju model-config -m admin/openstack-machines logging-config="<root>=INFO;unit=DEBUG"'
- name: Workaround - reset PG num
  # USE_WORKAROUND is the string 'true' or 'false'; a bare
  # `${{ env.USE_WORKAROUND }}` is truthy for both (any non-empty
  # string), so compare explicitly.
  if: ${{ env.USE_WORKAROUND == 'true' }}
  run: |
    # LP: #2096923
    # Dump the Ceph state for the log, switch PG autoscaling to "warn"
    # globally and on every existing pool, then pin pg_num to 32 for
    # the glance and cinder-ceph pools.
    echo '::warning::Workaround for https://launchpad.net/bugs/2096923'
    ssh sunbeam-multi-node-ha-1 -- '
      set -ex
      sudo microceph status
      sunbeam cluster list
      sudo ceph status
      sudo ceph health detail
      sudo ceph osd pool autoscale-status
      sudo ceph config set global osd_pool_default_pg_autoscale_mode warn
      sudo ceph osd pool ls | xargs -t -I{} sudo ceph osd pool set {} pg_autoscale_mode warn
      sudo ceph osd pool set glance pg_num 32
      sudo ceph osd pool set cinder-ceph pg_num 32
    '
- name: Sunbeam - Create registration tokens for the second and the third machines
  run: |
    # Tokens are written on the first machine as
    # sunbeam-multi-node-ha-<i>.asc, one per joining node.
    for i in {2..3}; do
      ssh sunbeam-multi-node-ha-1 -t -- sunbeam cluster add "sunbeam-multi-node-ha-${i}.localdomain" --output "sunbeam-multi-node-ha-${i}.asc"
    done
- name: Sunbeam - Prepare the second and the third machines
  run: |
    # same preparation as the first machine, minus --bootstrap
    for i in {2..3}; do
      ssh "sunbeam-multi-node-ha-${i}" -- sudo snap install openstack --channel 2024.1/edge
      ssh "sunbeam-multi-node-ha-${i}" -- 'sunbeam prepare-node-script | bash -x'
    done
- name: Sunbeam - Add the second and the third machines
  run: |
    # Copy each token from the first machine to its node, then join the
    # cluster there with the token fed on stdin.
    for i in {2..3}; do
      scp sunbeam-multi-node-ha-1:"sunbeam-multi-node-ha-${i}.asc" "sunbeam-multi-node-ha-${i}:"
      ssh "sunbeam-multi-node-ha-${i}" -t -- "cat 'sunbeam-multi-node-ha-${i}.asc' | sunbeam cluster join --role control,compute,storage -"
    done
- name: Workaround - check Ceph status before moving on
  # USE_WORKAROUND is the string 'true' or 'false'; a bare
  # `${{ env.USE_WORKAROUND }}` is truthy for both (any non-empty
  # string), so compare explicitly.
  if: ${{ env.USE_WORKAROUND == 'true' }}
  run: |
    # LP: #2095570
    # Dump the Ceph state for the log and fail the step unless the OSD
    # tree contains at least three host entries.
    echo '::warning::Workaround for https://launchpad.net/bugs/2095570'
    ssh sunbeam-multi-node-ha-1 -- '
      set -ex
      sudo microceph status
      sunbeam cluster list
      sudo ceph status
      sudo ceph health detail
      sudo ceph osd pool autoscale-status
      sudo ceph osd tree
      # bail out when OSD hosts are not added
      [ "$(sudo ceph osd tree --format json | jq '\''.nodes[] | select(.type=="host")'\'' | jq -s length)" -ge 3 ]
    '
- name: Sunbeam - Resize the control plane
  run: |
    # LP: #2065469
    # With workarounds enabled a failed resize is retried exactly once;
    # otherwise a single attempt decides the step result.
    if [ "$USE_WORKAROUND" = true ]; then
      echo '::warning::Workaround for https://launchpad.net/bugs/2065469'
      ssh sunbeam-multi-node-ha-1 -t -- sunbeam cluster resize \
        || \
        ssh sunbeam-multi-node-ha-1 -t -- sunbeam cluster resize
    else
      ssh sunbeam-multi-node-ha-1 -t -- sunbeam cluster resize
    fi
- name: Sunbeam - Configure the cloud
  run: |
    # demo-openrc is sourced by the later VM connection steps
    ssh sunbeam-multi-node-ha-1 -t -- sunbeam configure --openrc demo-openrc
- name: Sunbeam - Launch a VM
  run: |
    # start an ubuntu instance named "test" from the first machine
    ssh sunbeam-multi-node-ha-1 -t -- sunbeam launch ubuntu --name test
- name: Sunbeam - Connect to the VM
  run: |
    # After the VM gets ACTIVE in OpenStack, cloud-init inside it still
    # needs ~2 minutes to bring up the SSH service, hence the fixed wait.
    sleep 5m
    # From the first machine: take the first floating IP in the list and
    # wait over SSH for cloud-init in the guest to finish.
    ssh sunbeam-multi-node-ha-1 -- '
      set -ex
      source demo-openrc
      demo_floating_ip="$(openstack floating ip list -c Floating\ IP\ Address -f value | head -n1)"
      ssh -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i ~/snap/openstack/current/sunbeam "ubuntu@${demo_floating_ip}" -- cloud-init status --wait
    '
- name: Save admin-openrc
  run: |
    # write the admin credentials to ~/admin-openrc on every node
    for i in {1..3}; do
      ssh "sunbeam-multi-node-ha-${i}" -t -- 'sunbeam openrc > admin-openrc'
    done
- name: Smoke testing - Host reboots
  run: |
    # Reboot the nodes one at a time, leaving each 15 minutes to come
    # back before touching the next one.
    for i in {1..3}; do
      ssh "sunbeam-multi-node-ha-${i}" -- sudo reboot
      # wait some time to settle
      sleep 15m
    done
    # Afterwards launch a fresh instance and wait for its cloud-init over
    # SSH. The grep pattern is quoted so the remote shell neither strips
    # the backslashes (which would turn the dots into wildcards) nor
    # treats [0-9]+ as a filename glob.
    ssh sunbeam-multi-node-ha-1 -t -- '
      set -ex
      uptime -p
      sunbeam launch ubuntu --name test-after-reboot
      sleep 5m
      source demo-openrc
      ssh -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i ~/snap/openstack/current/sunbeam -l ubuntu \
        "$(openstack server show test-after-reboot --format yaml | grep -E -o "192\.168\.124\.[0-9]+")" -- cloud-init status --wait
    '
# TODO:
#- name: Smoke HA testing
# Keep the rendered manifest as an artifact even when earlier steps failed.
- uses: actions/upload-artifact@v4
  if: ${{ always() }}
  with:
    name: manifest.yaml
    path: ./manifest.yaml