Skip to content

Commit a3f600f

Browse files
authored
Merge pull request #1051 from stackhpc/2023.1-zed-merge
2023.1: zed merge
2 parents 5485a0f + 501a89e commit a3f600f

7 files changed

+75
-1
lines changed

etc/kayobe/ansible/pulp-host-image-promote.yml

+7
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,9 @@
1919
name: "{{ repository_name }}_{{ promotion_tag }}"
2020
base_path: "{{ base_path }}/{{ promotion_tag }}"
2121
register: distribution_details
22+
until: distribution_details is success
23+
retries: 3
24+
delay: 5
2225

2326
- name: Fail if the image does not exist
2427
fail:
@@ -34,6 +37,10 @@
3437
base_path: "{{ base_path }}/{{ promotion_tag }}"
3538
content_guard: release
3639
state: present
40+
register: content_guard_result
41+
until: content_guard_result is success
42+
retries: 3
43+
delay: 5
3744

3845
- name: Print version tag and os
3946
debug:

etc/kayobe/ansible/pulp-host-image-upload.yml

+26
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,10 @@
2525
password: "{{ remote_pulp_password }}"
2626
file: "{{ found_files.files[0].path }}"
2727
state: present
28+
register: upload_result
29+
until: upload_result is success
30+
retries: 3
31+
delay: 60
2832

2933
- name: Get sha256 hash
3034
ansible.builtin.stat:
@@ -40,6 +44,10 @@
4044
sha256: "{{ file_stats.stat.checksum }}"
4145
relative_path: "{{ found_files.files[0].path | basename }}"
4246
state: present
47+
register: file_content_result
48+
until: file_content_result is success
49+
retries: 3
50+
delay: 5
4351

4452
- name: Ensure file repo exists
4553
pulp.squeezer.file_repository:
@@ -48,6 +56,10 @@
4856
password: "{{ remote_pulp_password }}"
4957
name: "{{ repository_name }}"
5058
state: present
59+
register: file_repo_result
60+
until: file_repo_result is success
61+
retries: 3
62+
delay: 5
5163

5264
- name: Add content to file repo
5365
pulp.squeezer.file_repository_content:
@@ -58,6 +70,10 @@
5870
present_content:
5971
- relative_path: "{{ found_files.files[0].path | basename }}"
6072
sha256: "{{ file_stats.stat.checksum }}"
73+
register: file_repo_content_result
74+
until: file_repo_content_result is success
75+
retries: 3
76+
delay: 5
6177

6278
- name: Create a new publication to point to this version
6379
pulp.squeezer.file_publication:
@@ -67,6 +83,9 @@
6783
repository: "{{ repository_name }}"
6884
state: present
6985
register: publication_details
86+
until: publication_details is success
87+
retries: 3
88+
delay: 5
7089

7190
- name: Update distribution for latest version
7291
pulp.squeezer.file_distribution:
@@ -79,6 +98,9 @@
7998
content_guard: development
8099
state: present
81100
register: latest_distribution_details
101+
until: latest_distribution_details is success
102+
retries: 3
103+
delay: 5
82104

83105
- name: Create distribution for given version
84106
pulp.squeezer.file_distribution:
@@ -91,6 +113,10 @@
91113
content_guard: development
92114
state: present
93115
when: latest_distribution_details.changed
116+
register: distribution_result
117+
until: distribution_result is success
118+
retries: 3
119+
delay: 5
94120

95121
- name: Update new images file with versioned path
96122
lineinfile:

etc/kayobe/kolla.yml

+1-1
Original file line number | Diff line number | Diff line change
@@ -367,7 +367,7 @@ kolla_build_customizations: "{{ kolla_build_customizations_common | combine(koll
367367

368368
# Dict mapping Kolla Dockerfile ARG names to their values.
369369
kolla_build_args:
370-
node_exporter_version: "1.5.0" # kolla has 1.4.0
370+
node_exporter_version: "1.5.0" # kolla has 1.4.0
371371
node_exporter_sha256sum: "af999fd31ab54ed3a34b9f0b10c28e9acee9ef5ac5a5d5edfdde85437db7acbb"
372372

373373
###############################################################################

etc/kayobe/kolla/config/prometheus/system.rules

+24
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,30 @@ groups:
9696
summary: Host clock not synchronising (instance {{ $labels.instance }})
9797
description: "Clock not synchronising. Ensure NTP is configured on this host."
9898

99+
- alert: HostNetworkBondDegraded
100+
expr: (node_bonding_active - node_bonding_slaves) != 0
101+
for: 2m
102+
labels:
103+
severity: warning
104+
annotations:
105+
summary: Host network bond degraded (instance {{ $labels.instance }})
106+
description: "Bond {{ $labels.master }} degraded on {{ $labels.instance }}"
107+
{% endraw %}
108+
109+
{% if alertmanager_warn_network_bond_single_link | bool %}
110+
{% raw %}
111+
- alert: HostNetworkBondSingleLink
112+
expr: node_bonding_slaves == 1
113+
for: 2m
114+
labels:
115+
severity: warning
116+
annotations:
117+
summary: Host network bond with a single link (instance {{ $labels.instance }})
118+
description: "Bond {{ $labels.master }} configured with a single link on {{ $labels.instance }}"
119+
{% endraw %}
120+
{% endif %}
121+
122+
{% raw %}
99123
- alert: HostConntrackLimit
100124
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
101125
for: 5m

etc/kayobe/stackhpc-monitoring.yml

+4
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,10 @@
88
# of free memory is lower than this value an alert will be triggered.
99
alertmanager_low_memory_threshold_gib: 5
1010

11+
# Whether to raise an alert if any network bond is configured with a single
12+
# link. Change to false to disable this alert.
13+
alertmanager_warn_network_bond_single_link: true
14+
1115
###############################################################################
1216
# Exporter configuration
1317

Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
---
2+
features:
3+
- |
4+
Adds a new Prometheus alert ``HostNetworkBondDegraded`` which will be
5+
raised when at least one bond member is down.
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
---
2+
features:
3+
- |
4+
Adds a new Prometheus alert ``HostNetworkBondSingleLink`` which will be
5+
raised when a bond is configured with only one member. This can happen when
6+
NetworkManager detects that a bond member is down at boot time. This alert
7+
can be disabled by setting ``alertmanager_warn_network_bond_single_link``
8+
to ``false``.

0 commit comments

Comments
 (0)