---
# yaml-language-server: $schema=https://taskfile.dev/schema.json
version: "3"
vars:
  CLUSTER_DIR: "{{.ROOT_DIR}}/kubernetes"

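# shared helper task files; Task exposes their tasks under the `lint:` and `volsync:` prefixes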
includes:
  lint: .taskfiles/Lint/Tasks.yaml
  volsync: .taskfiles/VolSync/Tasks.yaml

tasks:
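  # `task` with no arguments falls through to this task and lists the available tasks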
  default:
    silent: true
    cmds:
      - "sh -c 'go-task -l || task -l'"

  # yamllint disable rule:line-length
  'bootstrap:stage1':
    desc: >-
      Install core manifests for CNI, storage, monitoring and GitOps (ex. task bootstrap:stage1 cluster=deedee)
    vars:
      cluster: '{{ or .cluster (fail "`cluster` is required") }}'
    cmds:
      # necessary CRDs
      - kubectl apply -f https://raw.githubusercontent.com/giantswarm/silence-operator/master/config/crd/monitoring.giantswarm.io_silences.yaml
      - kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/main/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
      - kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/main/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
      - kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/main/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
      - kubectl apply -f https://raw.githubusercontent.com/kubernetes-csi/external-snapshotter/master/client/config/crd/groupsnapshot.storage.k8s.io_volumegroupsnapshotclasses.yaml
      - kubectl apply -f https://raw.githubusercontent.com/kubernetes-csi/external-snapshotter/master/client/config/crd/groupsnapshot.storage.k8s.io_volumegroupsnapshotcontents.yaml
      - kubectl apply -f https://raw.githubusercontent.com/kubernetes-csi/external-snapshotter/master/client/config/crd/groupsnapshot.storage.k8s.io_volumegroupsnapshots.yaml
      - kubectl apply -f https://raw.githubusercontent.com/kubernetes-csi/external-snapshotter/master/client/config/crd/snapshot.storage.k8s.io_volumesnapshotclasses.yaml
      - kubectl apply -f https://raw.githubusercontent.com/kubernetes-csi/external-snapshotter/master/client/config/crd/snapshot.storage.k8s.io_volumesnapshotcontents.yaml
      - kubectl apply -f https://raw.githubusercontent.com/kubernetes-csi/external-snapshotter/master/client/config/crd/snapshot.storage.k8s.io_volumesnapshots.yaml
      # apps
      - helm dependency build kubernetes/apps/kube-system/coredns/
      - helm template -n kube-system -f talos/{{ .cluster }}/values.bootstrap.yaml coredns kubernetes/apps/kube-system/coredns/ | kubectl apply -n kube-system -f -
      - helm dependency build kubernetes/apps/kube-system/cilium/
      # warnings about missing resource CRDs are fine here
      - helm template -n kube-system -f talos/{{ .cluster }}/values.bootstrap.yaml cilium kubernetes/apps/kube-system/cilium/ | kubectl apply -n kube-system -f -
      - kubectl -n kube-system rollout status daemonset cilium --timeout=1800s
      - kubectl -n kube-system rollout status deployment coredns --timeout=1800s
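      # the tools namespace hosts cluster utilities (kubelet-csr-approver, snapshot-controller, secrets-store-csi-driver, vault) and is labelled for privileged Pod Security admission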
      - kubectl create namespace tools
      - kubectl label --overwrite namespace tools pod-security.kubernetes.io/enforce=privileged pod-security.kubernetes.io/warn=privileged pod-security.kubernetes.io/audit=privileged
      - helm dependency build kubernetes/apps/tools/kubelet-csr-approver/
      - helm template -n tools kubelet-csr-approver kubernetes/apps/tools/kubelet-csr-approver/ | kubectl apply -n tools -f -
      - kubectl -n tools rollout status deployment kubelet-csr-approver --timeout=1800s
      - helm dependency build kubernetes/apps/tools/snapshot-controller/
      - helm template -n tools snapshot-controller kubernetes/apps/tools/snapshot-controller/ | kubectl apply -n tools -f -
      - kubectl -n tools rollout status deployment snapshot-controller --timeout=1800s
      - kubectl -n tools rollout status deployment snapshot-validation-webhook --timeout=1800s
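      # Rook Ceph: deploy the operator first, wait for the discover daemonset, then create the Ceph cluster and wait until it reports HEALTH_OK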
      - kubectl create namespace rook-ceph
      - kubectl label --overwrite namespace rook-ceph pod-security.kubernetes.io/enforce=privileged pod-security.kubernetes.io/warn=privileged pod-security.kubernetes.io/audit=privileged
      - helm dependency build kubernetes/apps/rook-ceph/rook-ceph/
      - helm template -n rook-ceph rook-ceph kubernetes/apps/rook-ceph/rook-ceph/ | kubectl apply -n rook-ceph -f -
      - kubectl -n rook-ceph rollout status deployment rook-ceph-operator --timeout=1800s
      # wait for discover pods to spawn
      - sleep 10
      - kubectl -n rook-ceph rollout status daemonset rook-discover --timeout=1800s
      - helm dependency build kubernetes/apps/rook-ceph/rook-ceph-cluster/
      - helm template -n rook-ceph -f talos/{{ .cluster }}/values.bootstrap.yaml rook-ceph-cluster kubernetes/apps/rook-ceph/rook-ceph-cluster/ | kubectl apply -n rook-ceph -f -
      - kubectl -n rook-ceph wait --timeout=1800s --for=jsonpath='{.status.message}=Cluster created successfully' cephcluster rook-ceph
      - kubectl -n rook-ceph wait --timeout=1800s --for=jsonpath='{.status.phase}=Ready' cephcluster rook-ceph
      - kubectl -n rook-ceph wait --timeout=1800s --for=jsonpath='{.status.ceph.health}=HEALTH_OK' cephcluster rook-ceph
      - helm dependency build kubernetes/apps/tools/secrets-store-csi-driver/
      - helm template -n tools secrets-store-csi-driver kubernetes/apps/tools/secrets-store-csi-driver/ | kubectl apply -n tools -f -
      - kubectl -n tools rollout status daemonset secrets-store-csi-driver --timeout=1800s
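      # deploy Argo CD, VolSync and Vault with bootstrap values; Argo CD Applications take over management below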
      - helm dependency build kubernetes/apps/argocd/argocd/
      - kubectl create namespace argocd
      # ignore ingress complaints - it's fine at this point
      - helm template -n argocd -f talos/{{ .cluster }}/values.bootstrap.yaml argocd kubernetes/apps/argocd/argocd/ | kubectl apply -n argocd -f -
      - kubectl -n argocd rollout status deployment argocd-redis --timeout=1800s
      - kubectl -n argocd rollout status deployment argocd-server --timeout=1800s
      - kubectl -n argocd rollout status deployment argocd-repo-server --timeout=1800s
      - kubectl create namespace workers
      - kubectl label --overwrite namespace workers pod-security.kubernetes.io/enforce=privileged pod-security.kubernetes.io/warn=privileged pod-security.kubernetes.io/audit=privileged
      - helm dependency build kubernetes/apps/workers/volsync/
      - helm template -n workers -f talos/{{ .cluster }}/values.bootstrap.yaml volsync kubernetes/apps/workers/volsync/ | kubectl apply -n workers -f -
      - kubectl -n workers rollout status deployment volsync --timeout=1800s
      - helm dependency build kubernetes/apps/tools/vault/
      - helm template -n tools -f talos/{{ .cluster }}/values.bootstrap.yaml vault kubernetes/apps/tools/vault/ | kubectl apply -n tools -f -
      - kubectl -n tools rollout status statefulset vault --timeout=1800s
      # ArgoCD can kick in here
      - kubectl apply -f kubernetes/apps/tools/system-upgrade-controller/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application system-upgrade-controller
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application system-upgrade-controller
      - sleep 10
      - kubectl -n tools wait --for=condition=Complete --timeout=3600s plan talos
      - kubectl -n tools wait --for=condition=Complete --timeout=3600s plan kubernetes
      - kubectl wait --for=condition=Ready nodes --all --timeout=3600s
      - echo -e '\n\nENSURE SYSTEM-UPGRADE-CONTROLLER DID THE INITIAL SELF-REFRESH UPGRADE\n- all nodes should have plan.upgrade.cattle.io/talos and control-plane nodes should have plan.upgrade.cattle.io/kubernetes annotations\n- ensure all pods are running\n\nPress ENTER when done'
      - read test
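      # hand the remaining components over to Argo CD one Application at a time, waiting for each to sync and report healthy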
      - kubectl apply -f kubernetes/apps/tools/spegel/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application spegel
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application spegel
      - kubectl apply -f kubernetes/apps/kube-system/metrics-server/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application metrics-server
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application metrics-server
      - kubectl apply -f kubernetes/apps/tools/secrets-store-csi-driver/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application secrets-store-csi-driver
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application secrets-store-csi-driver
      - kubectl apply -f kubernetes/apps/tools/snapshot-controller/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application snapshot-controller
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application snapshot-controller
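      # coredns and cilium were installed with `helm template` during bootstrap, so their workloads are restarted once Argo CD adopts them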
      - kubectl apply -f kubernetes/apps/kube-system/coredns/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application coredns
      - kubectl rollout restart deployment coredns -n kube-system
      - kubectl -n kube-system rollout status deployment coredns --timeout=1800s
      - kubectl apply -f kubernetes/apps/kube-system/cilium/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application cilium
      - kubectl rollout restart daemonset cilium -n kube-system
      - kubectl -n kube-system rollout status daemonset cilium --timeout=1800s
      - kubectl apply -f kubernetes/apps/cert-manager/cert-manager/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application cert-manager
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application cert-manager
      - kubectl apply -f kubernetes/apps/networking/ingress-nginx/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application ingress-nginx-internal
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application ingress-nginx-internal
      # the external ingress is necessary for the Argo CD webhook
      - kubectl apply -f kubernetes/apps/external/ingress-nginx/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application ingress-nginx-external
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application ingress-nginx-external
      # cilium only becomes healthy after ingress is up, because of hubble
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application cilium
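      # Argo CD now adopts the components that were helm-templated during bootstrap (argocd itself, volsync, rook-ceph, rook-ceph-cluster, vault); the short sleeps give the re-applied manifests time to settle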
      - kubectl apply -f kubernetes/apps/argocd/argocd/application.yaml
      # wait for reapply
      - sleep 2
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application argocd
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application argocd
      - kubectl apply -f kubernetes/apps/workers/volsync/application.yaml
      # wait for reapply
      - sleep 2
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application volsync
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application volsync
      - kubectl apply -f kubernetes/apps/rook-ceph/rook-ceph/application.yaml
      # wait for reapply
      - sleep 2
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application rook-ceph
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application rook-ceph
      - kubectl apply -f kubernetes/apps/rook-ceph/rook-ceph-cluster/application.yaml
      # wait for reapply
      - sleep 2
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application rook-ceph-cluster
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application rook-ceph-cluster
      - kubectl apply -f kubernetes/apps/tools/vault/application.yaml
      # wait for reapply
      - sleep 2
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application vault
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application vault
      - kubectl apply -f kubernetes/apps/tools/reloader/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application reloader
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application reloader
      - kubectl apply -f kubernetes/apps/monitoring/silence-operator/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application silence-operator
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application silence-operator
      - kubectl apply -f kubernetes/apps/networking/smtp-relay/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application smtp-relay
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application smtp-relay
      - kubectl apply -f kubernetes/apps/monitoring/kube-prometheus-stack/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application kube-prometheus-stack
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application kube-prometheus-stack
      - kubectl apply -f kubernetes/apps/database/redis-ha/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application redis-ha
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application redis-ha
      - echo -e '\n\nADJUST kubernetes/apps/database/cloudnative-pg/templates/cluster.yaml\nBump postgres-vX by 1 for both currentCluster and previousCluster\nThen push the change to the repo before proceeding!\n\nPress ENTER when done'
      - read test
      - kubectl apply -f kubernetes/apps/database/cloudnative-pg/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application cloudnative-pg
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application cloudnative-pg
      - kubectl -n database wait --timeout=1800s --for=jsonpath='{.status.phase}="Cluster in healthy state"' cluster postgres-16
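      # identity stack: lldap first, then authelia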
      - kubectl apply -f kubernetes/apps/default/lldap/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application lldap
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application lldap
      - kubectl apply -f kubernetes/apps/default/authelia/application.yaml
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.sync.status}=Synced' application authelia
      - kubectl -n argocd wait --timeout=1800s --for=jsonpath='{.status.health.status}=Healthy' application authelia
  # yamllint enable rule:line-length

  mount-volume:
    desc: >-
      Mount a PersistentVolumeClaim to a temporary pod (ex. task mount-volume claim=data-vault-0 [namespace=default])
    interactive: true
    vars:
      claim: '{{ or .claim (fail "PersistentVolumeClaim `claim` is required") }}'
      namespace: '{{.namespace | default "default"}}'
    # yamllint disable rule:line-length
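    # spawns a throwaway Alpine pod with the claim mounted at /config; the pod is removed when the shell exits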
    cmd: |
      kubectl run -n {{.namespace}} debug-{{.claim}} -i --tty --rm --image=null --privileged --overrides='
        {
          "apiVersion": "v1",
          "spec": {
            "containers": [
              {
                "name": "debug",
                "image": "public.ecr.aws/docker/library/alpine:edge",
                "command": [
                  "/bin/sh",
                  "-c",
                  "echo 'http://dl-cdn.alpinelinux.org/alpine/edge/testing' >> /etc/apk/repositories; apk update; /bin/sh"
                ],
                "stdin": true,
                "stdinOnce": true,
                "tty": true,
                "volumeMounts": [
                  {
                    "name": "config",
                    "mountPath": "/config"
                  }
                ]
              }
            ],
            "volumes": [
              {
                "name": "config",
                "persistentVolumeClaim": {
                  "claimName": "{{.claim}}"
                }
              }
            ],
            "restartPolicy": "Never"
          }
        }'
    # yamllint enable rule:line-length
    preconditions:
      - kubectl -n {{.namespace}} get pvc {{.claim}}

  'debug:talos':
    desc: >-
      Run a privileged pod with the Talos host filesystem mounted at /host and host networking enabled
      (ex. task debug:talos node=blossom [namespace=tools])
    interactive: true
    vars:
      node: '{{ or .node (fail "Node name `node` is required") }}'
      namespace: '{{.namespace | default "tools"}}'
    # yamllint disable rule:line-length
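    # spawns a throwaway privileged Alpine pod pinned to the node, with the host filesystem mounted at /host and hostNetwork enabled; tolerations allow it to run on control-plane nodes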
    cmd: |
      kubectl run -n {{.namespace}} debug-{{.node}} -i --tty --rm --image=null --privileged --overrides='
        {
          "apiVersion": "v1",
          "spec": {
            "nodeSelector": {
              "kubernetes.io/hostname": "{{ .node }}"
            },
            "containers": [
              {
                "name": "debug",
                "image": "public.ecr.aws/docker/library/alpine:edge",
                "command": [
                  "/bin/sh",
                  "-c",
                  "echo 'http://dl-cdn.alpinelinux.org/alpine/edge/testing' >> /etc/apk/repositories; apk update; /bin/sh"
                ],
                "securityContext": {
                  "privileged": true
                },
                "stdin": true,
                "stdinOnce": true,
                "tty": true,
                "volumeMounts": [
                  {
                    "name": "host",
                    "mountPath": "/host"
                  }
                ]
              }
            ],
            "hostNetwork": true,
            "tolerations": [
              { "key": "node-role.kubernetes.io/control-plane", "operator": "Exists", "effect": "NoSchedule" },
              { "key": "node.kubernetes.io/unschedulable", "operator": "Exists", "effect": "NoSchedule" }
            ],
            "volumes": [
              {
                "name": "host",
                "hostPath": {
                  "path": "/",
                  "type": "Directory"
                }
              }
            ],
            "restartPolicy": "Never"
          }
        }'
    # yamllint enable rule:line-length
    preconditions:
      - kubectl get node {{.node}}
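
# Example invocations (values are illustrative, taken from the task descriptions above;
# the namespace variable is optional and defaults per task):
#   task bootstrap:stage1 cluster=deedee
#   task mount-volume claim=data-vault-0
#   task debug:talos node=blossom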