try another revision #3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Sunbeam Multi-node HA

# NOTE(review): the whole trigger section below is commented out, so as
# written this workflow declares no `on:` events. It is kept so it can be
# re-enabled later.
#on:
#  push:
#    branches: ["act"]
#  workflow_dispatch:
#    inputs:
#      use_workaround:
#        description: 'Apply a workaround'
#        required: true
#        default: true
#        type: boolean
#      hardware_profile:
#        description: 'Specs for each machine'
#        required: true
#        default: tutorial
#        type: choice
#        options:
#          - minimal
#          - minimal-with-cpu-overcommit
#          - tutorial
#          - allowance

permissions:
  contents: read

env:
  COLUMNS: 160  # default: 80
  DEBIAN_FRONTEND: noninteractive
  # github.event.inputs.use_workaround is a string:
  #   - 'true' or 'false' (non-empty) when set through workflow_dispatch
  #   - '' for the on-push event
  # so `|| true` only supplies the default when no input was given.
  USE_WORKAROUND: ${{ github.event.inputs.use_workaround || true }}
  HARDWARE_PROFILE: ${{ inputs.hardware_profile || 'tutorial' }}

defaults:
  run:
    # act does not print a useful per-step elapsed time, hence the
    # /usr/bin/time wrapper. The apparently redundant leading
    # `bash -e -c` is what lets actionlint recognise the shell and run
    # shellcheck on the steps.
    shell: bash -e -c '/usr/bin/time -f "\nStep total time:\t%E" bash -ex {0}'
jobs:
  # Lints this workflow file; the multi-node-ha job lists it under `needs`.
  actionlint:
    runs-on:
      - self-hosted
      - linux
      - AMD64
      - X64
      - medium
      - noble
    steps:
      - uses: actions/checkout@v4

      # Skipped under act (env.ACT set); the lint step below then falls
      # back to an `actionlint` binary found on PATH.
      - id: get_actionlint
        name: Download actionlint
        if: ${{ !env.ACT }}
        run: bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)

      - name: Install prerequisites for actionlint
        if: ${{ !env.ACT }}
        run: |
          sudo apt-get update
          sudo apt-get install -y shellcheck

      - name: Check workflow files
        run: |
          # use a local binary if env.ACT
          ${{ steps.get_actionlint.outputs.executable || 'actionlint' }} -color \
            .github/workflows/multi-node-ha.yml
# Deployment job; it only starts after the actionlint job has succeeded.
multi-node-ha:
  name: Multi-node HA
  needs: actionlint
  runs-on:
    - self-hosted
    - linux
    - AMD64
    - X64
    - large
    - noble
  steps:
    - uses: actions/checkout@v4
- name: Set env
  run: |
    # Append one hardware profile to $GITHUB_ENV.
    # Arguments, in order: CPU MEMORY DISK EXTRA_DISK
    write_profile() {
      {
        echo "CPU=$1"
        echo "MEMORY=$2"
        echo "DISK=$3"
        echo "EXTRA_DISK=$4"
      } >> "$GITHUB_ENV"
    }

    if [ "$USE_WORKAROUND" = true ]; then
      echo '::warning::Not a clean run. Some workarounds are going to be used.'
    fi

    case "$HARDWARE_PROFILE" in
      minimal)
        write_profile 4 20 128 128
        ;;
      minimal-with-cpu-overcommit)
        write_profile 16 20 128 128
        ;;
      tutorial)
        # https://canonical.com/microstack/docs/multi-node
        write_profile 4 32 250 200
        ;;
      allowance)
        write_profile 16 64 512 512
        ;;
      *)
        echo '::error:: Invalid hardware profile'
        exit 1
        ;;
    esac

    # FIXME: use $GITHUB_STEP_SUMMARY
    # Load the values just written so they can be echoed in the notice.
    # shellcheck source=/dev/null
    source "$GITHUB_ENV"
    echo "::notice::Selected hardware profile: ${HARDWARE_PROFILE}.%0ACPU: ${CPU}%0AMEMORY: ${MEMORY}%0ADISK: ${DISK}%0AEXTRA_DISK: ${EXTRA_DISK}"
- name: Check machine specs
  run: |
    # On bare metal this prints "none" and exits 1; do not fail the step.
    systemd-detect-virt || true

    # Always-on summary of the runner: OS, CPU, memory, block devices.
    cat /etc/os-release
    lscpu
    free -h
    lsblk -e7
    lsblk -e7 -f

    # Extra detail only when step debugging is on.
    if [[ "${ACTIONS_STEP_DEBUG:-}" == true ]]; then
      lsblk
      lsblk -f
      # IPv6 address can be sensitive
      ip -br a
      ip r
      resolvectl --no-pager
    fi
# TODO: check if greenfield or brownfield first
- name: Install prerequisites
  if: ${{ !env.ACT }}
  run: |
    sudo apt-get update
    sudo apt-get install -y \
      uvtool \
      j2cli
    # The GitHub "runner" user is not in the sudo group, so it is not
    # added to the libvirt group automatically; add the current user
    # explicitly.
    sudo adduser "$USER" libvirt
- name: Download a VM image
  if: ${{ !env.ACT }}
  run: |
    # Run with the libvirt group (`sudo -g libvirt`).
    simplestreams=(sudo -g libvirt uvt-simplestreams-libvirt)
    "${simplestreams[@]}" sync release=noble arch=amd64
    # List what is now available locally.
    "${simplestreams[@]}" query
- name: Prepare SSH, virtual network bridge
  if: ${{ !env.ACT }}
  run: |
    # SSH: passphrase-less key plus the repository's client config.
    ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -N ''
    tee -a ~/.ssh/config < .github/assets/workflows/ssh_config

    # bridge: define, mark autostart, then start the libvirt network.
    virsh_system=(sudo -g libvirt virsh -c qemu:///system)
    "${virsh_system[@]}" net-define .github/assets/workflows/sunbeam-virbr0.xml
    "${virsh_system[@]}" net-autostart sunbeam-virbr0
    "${virsh_system[@]}" net-start sunbeam-virbr0
- name: Clean up previous virtual machines
  # Only under act.
  if: ${{ env.ACT }}
  run: |
    for node in 1 2 3; do
      # FIXME: the requirement of FQDN is not documented well in each tutorial
      # Both commands are best-effort: a missing VM or host key is fine.
      sudo -g libvirt uvt-kvm destroy "sunbeam-multi-node-ha-${node}.localdomain" || true
      ssh-keygen -R "192.168.124.4${node}" || true
    done
- name: Prepare virtual machines
  run: |
    virsh_system=(sudo -g libvirt virsh -c qemu:///system)

    # Define the three VMs without starting them (--no-start). Each gets a
    # static address 192.168.124.4<N> through the netplan config on stdin.
    for idx in 1 2 3; do
      sudo -g libvirt uvt-kvm create \
        --machine-type q35 \
        --cpu "$CPU" \
        --host-passthrough \
        --memory "$((MEMORY * 1024))" \
        --disk "$DISK" \
        --ephemeral-disk "$EXTRA_DISK" \
        --ephemeral-disk "$EXTRA_DISK" \
        --unsafe-caching \
        --bridge sunbeam-virbr0 \
        --network-config /dev/stdin \
        --ssh-public-key-file ~/.ssh/id_ed25519.pub \
        --no-start \
        "sunbeam-multi-node-ha-${idx}.localdomain" \
        release=noble <<EOF
    network:
      version: 2
      ethernets:
        enp1s0:
          dhcp4: false
          dhcp6: false
          accept-ra: false
          addresses:
            - 192.168.124.4${idx}/24
          routes:
            - to: default
              via: 192.168.124.1
          nameservers:
            addresses:
              - 192.168.124.1
    EOF
    done

    # secondary NIC
    for idx in 1 2 3; do
      "${virsh_system[@]}" attach-interface "sunbeam-multi-node-ha-${idx}.localdomain" \
        network sunbeam-virbr0 \
        --model virtio --config
    done

    # LP: #2095570
    if [ "$USE_WORKAROUND" = true ]; then
      echo '::warning::Workaround for https://launchpad.net/bugs/2095570'
      # Add one extra qcow2 volume per VM, attached as a SATA disk (sda).
      for idx in 1 2 3; do
        "${virsh_system[@]}" vol-create-as uvtool --format qcow2 \
          "sunbeam-multi-node-ha-${idx}-sata1.qcow" "$((EXTRA_DISK * 1024**3))"
        "${virsh_system[@]}" attach-disk "sunbeam-multi-node-ha-${idx}.localdomain" \
          "/var/lib/uvtool/libvirt/images/sunbeam-multi-node-ha-${idx}-sata1.qcow" \
          sda --subdriver qcow2 --targetbus sata --config
      done
    fi

    # Boot everything, then poll each VM over SSH until cloud-init is done.
    for idx in 1 2 3; do
      "${virsh_system[@]}" start "sunbeam-multi-node-ha-${idx}.localdomain"
    done
    for idx in 1 2 3; do
      until ssh -oStrictHostKeyChecking=no "sunbeam-multi-node-ha-${idx}" -- 'cloud-init status --wait; ip -br a; lsblk'; do
        sleep 5
      done
    done

    # LP: #2065911
    if [ "$USE_WORKAROUND" = true ]; then
      echo '::warning::Workaround for https://launchpad.net/bugs/2065911'
      # Drop a netplan file that disables DHCP and RA on enp9s0, then apply.
      for idx in 1 2 3; do
        ssh "sunbeam-multi-node-ha-${idx}" -- 'sudo install -m 0600 /dev/stdin /etc/netplan/90-local-ovs-ext-port.yaml <<EOF
    network:
      version: 2
      ethernets:
        # LP: #2065911
        enp9s0:
          dhcp4: false
          dhcp6: false
          accept-ra: false
    EOF
    sudo netplan apply
    '
      done
    fi
- name: Sunbeam - Prepare manifest file
  run: |
    if [ "$USE_WORKAROUND" = true ]; then
      for bug in 2098163 2098438; do
        echo "::warning::Workaround for https://launchpad.net/bugs/${bug}"
      done
    fi

    # Render the manifest template; the template variables are supplied
    # as YAML on stdin.
    # NOTE(review): DOCKERHUB_MIRROR is not set anywhere in this
    # workflow - presumably it comes from the runner environment; confirm.
    j2 -f yaml -o ./manifest.yaml .github/assets/workflows/multi-node/manifest.yaml.j2 - <<EOF
    use_workaround: $USE_WORKAROUND
    dockerhub_mirror: $DOCKERHUB_MIRROR
    EOF

    # Copy the rendered manifest to the first node's home directory.
    scp ./manifest.yaml sunbeam-multi-node-ha-1:
- name: Sunbeam - Prepare the first machine
  run: |
    first_node=sunbeam-multi-node-ha-1
    # Install the snap, then run the generated bootstrap preparation
    # script on the node itself.
    ssh "$first_node" -- sudo snap install openstack --channel 2024.1/edge
    ssh "$first_node" -- 'sunbeam prepare-node-script --bootstrap | bash -x'
- name: Sunbeam - Bootstrap the cloud
  run: |
    # Why -t (a single one):
    #  * -t is necessary to see some progress in act env, LP:#2097451
    #  * without -t, the add-k8s command somehow gets stuck in act env,
    #    although it does not happen on a GitHub runner
    #  * without -tt, the GitHub runner's log should stay quiet
    ssh sunbeam-multi-node-ha-1 -t -- \
      sunbeam cluster bootstrap --manifest manifest.yaml \
      --role control,compute,storage
- name: Workaround - destroy localhost controller
  # env values are strings, and the non-empty string 'false' is truthy in
  # an expression, so compare against 'true' explicitly; testing the bare
  # value would run this step even with the workaround disabled.
  if: ${{ env.USE_WORKAROUND == 'true' }}
  run: |
    # LP: #2095487, without doing it, a random /24 range will be
    # dead from a user perspective and the OpenStack API cannot
    # return a response to the range since it will go to the
    # unused network bridge.
    echo '::warning::Workaround for https://launchpad.net/bugs/2095487'
    ssh sunbeam-multi-node-ha-1 -- juju destroy-controller localhost-localhost --no-prompt
    ssh sunbeam-multi-node-ha-1 -- lxc profile device remove default eth0
    ssh sunbeam-multi-node-ha-1 -- lxc network delete sunbeambr0
- name: Workaround - enable debug logging
  # env values are strings, and the non-empty string 'false' is truthy in
  # an expression, so compare against 'true' explicitly; testing the bare
  # value would run this step even with the workaround disabled.
  if: ${{ env.USE_WORKAROUND == 'true' }}
  run: |
    # LP: #2065490
    # Raise unit logging to DEBUG in the openstack-machines model.
    ssh sunbeam-multi-node-ha-1 -- 'juju model-config -m admin/openstack-machines logging-config="<root>=INFO;unit=DEBUG"'
- name: Workaround - reset PG num
  # env values are strings, and the non-empty string 'false' is truthy in
  # an expression, so compare against 'true' explicitly; testing the bare
  # value would run this step even with the workaround disabled.
  if: ${{ env.USE_WORKAROUND == 'true' }}
  run: |
    # LP: #2096923
    echo '::warning::Workaround for https://launchpad.net/bugs/2096923'
    # The remote script first prints cluster/Ceph state for the log, then
    # switches PG autoscaling to "warn" (globally and per pool) and pins
    # pg_num of the glance and cinder-ceph pools to 32.
    ssh sunbeam-multi-node-ha-1 -- '
      set -ex
      sudo microceph status
      sunbeam cluster list
      sudo ceph status
      sudo ceph health detail
      sudo ceph osd pool autoscale-status
      sudo ceph config set global osd_pool_default_pg_autoscale_mode warn
      sudo ceph osd pool ls | xargs -t -I{} sudo ceph osd pool set {} pg_autoscale_mode warn
      sudo ceph osd pool set glance pg_num 32
      sudo ceph osd pool set cinder-ceph pg_num 32
    '
- name: Sunbeam - Create registration tokens for the second and the third machines
  run: |
    # Tokens are generated on node 1 and written there as <member>.asc.
    for n in 2 3; do
      member="sunbeam-multi-node-ha-${n}"
      ssh sunbeam-multi-node-ha-1 -t -- \
        sunbeam cluster add "${member}.localdomain" --output "${member}.asc"
    done
- name: Sunbeam - Prepare the second and the third machines
  run: |
    # Same preparation as the first machine, minus --bootstrap.
    for n in 2 3; do
      member="sunbeam-multi-node-ha-${n}"
      ssh "$member" -- sudo snap install openstack --channel 2024.1/edge
      ssh "$member" -- 'sunbeam prepare-node-script | bash -x'
    done
- name: Sunbeam - Add the second and the third machines
  run: |
    for n in 2 3; do
      member="sunbeam-multi-node-ha-${n}"
      token="${member}.asc"
      # Move the token from node 1 to the joining node, then feed it to
      # `sunbeam cluster join` on stdin.
      scp "sunbeam-multi-node-ha-1:${token}" "${member}:"
      ssh "$member" -t -- "cat '${token}' | sunbeam cluster join --role control,compute,storage -"
    done
- name: Workaround - check Ceph status before moving on
  # env values are strings, and the non-empty string 'false' is truthy in
  # an expression, so compare against 'true' explicitly; testing the bare
  # value would run this step even with the workaround disabled.
  if: ${{ env.USE_WORKAROUND == 'true' }}
  run: |
    # LP: #2095570
    echo '::warning::Workaround for https://launchpad.net/bugs/2095570'
    # Print Ceph state for the log; the final test fails the step (via
    # set -e) unless the OSD tree contains at least three host nodes.
    ssh sunbeam-multi-node-ha-1 -- '
      set -ex
      sudo microceph status
      sunbeam cluster list
      sudo ceph status
      sudo ceph health detail
      sudo ceph osd pool autoscale-status
      sudo ceph osd tree
      # bail out when OSD hosts are not added
      [ "$(sudo ceph osd tree --format json | jq '\''.nodes[] | select(.type=="host")'\'' | jq -s length)" -ge 3 ]
    '
- name: Sunbeam - Resize the control plane
  run: |
    resize_cluster() {
      ssh sunbeam-multi-node-ha-1 -t -- sunbeam cluster resize
    }

    # LP: #2065469
    if [ "$USE_WORKAROUND" != true ]; then
      resize_cluster
    else
      echo '::warning::Workaround for https://launchpad.net/bugs/2065469'
      # The workaround is a single retry if the first attempt fails.
      resize_cluster || resize_cluster
    fi
# Runs `sunbeam configure` on node 1, writing the demo credentials to
# demo-openrc there.
- name: Sunbeam - Configure the cloud
  run: ssh sunbeam-multi-node-ha-1 -t -- sunbeam configure --openrc demo-openrc
# Launches an Ubuntu instance named "test" from node 1.
- name: Sunbeam - Launch a VM
  run: ssh sunbeam-multi-node-ha-1 -t -- sunbeam launch ubuntu --name test
- name: Sunbeam - Connect to the VM
  run: |
    # After the VM turns ACTIVE in OpenStack, cloud-init inside it still
    # needs ~2 minutes before the SSH service is up, so wait generously.
    sleep 5m

    # From node 1: take the first floating IP of the demo project and
    # SSH into the instance, blocking until its cloud-init has finished.
    ssh sunbeam-multi-node-ha-1 -- '
      set -ex
      source demo-openrc
      demo_floating_ip="$(openstack floating ip list -c Floating\ IP\ Address -f value | head -n1)"
      ssh -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i ~/snap/openstack/current/sunbeam "ubuntu@${demo_floating_ip}" -- cloud-init status --wait
    '
- name: Save admin-openrc
  run: |
    # Write `sunbeam openrc` output to admin-openrc on each of the nodes.
    for node in 1 2 3; do
      ssh "sunbeam-multi-node-ha-${node}" -t -- 'sunbeam openrc > admin-openrc'
    done
- name: Smoke testing - Host reboots
  run: |
    # Reboot the nodes one at a time, leaving each time to settle before
    # touching the next one.
    for i in {1..3}; do
      ssh "sunbeam-multi-node-ha-${i}" -- sudo reboot
      # wait some time to settle
      sleep 15m
    done

    # After all reboots: launch a fresh instance from node 1 and make sure
    # it is reachable over SSH on its 192.168.124.x address.
    #
    # The address regex is double-quoted so the remote shell passes it to
    # grep untouched; unquoted, the backslashes were stripped (turning the
    # dots into wildcards) and `[0-9]+` was exposed to globbing.
    # `head -n1` keeps the host argument a single address even if the
    # pattern matches more than once.
    ssh sunbeam-multi-node-ha-1 -t -- '
      set -ex
      uptime -p
      sunbeam launch ubuntu --name test-after-reboot
      sleep 5m
      source demo-openrc
      ssh -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i ~/snap/openstack/current/sunbeam -l ubuntu \
        "$(openstack server show test-after-reboot --format yaml | grep -E -o "192\.168\.124\.[0-9]+" | head -n1)" -- cloud-init status --wait
    '
# TODO:
#- name: Smoke HA testing

# Publish the rendered manifest even when an earlier step failed.
- uses: actions/upload-artifact@v4
  if: always()
  with:
    name: manifest.yaml
    path: ./manifest.yaml