Skip to content

Commit 87dc9d2

Browse files
committedOct 26, 2023
enable retry based on labels
Fixes #150 - handle retry when already in delete stages - account for new generation and deleted
1 parent ec174bd commit 87dc9d2

File tree

5 files changed

+115
-15
lines changed

5 files changed

+115
-15
lines changed
 

‎deploy/crds/tf.galleybytes.com_terraforms_crd.yaml

+13-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ apiVersion: apiextensions.k8s.io/v1
33
kind: CustomResourceDefinition
44
metadata:
55
annotations:
6-
controller-gen.kubebuilder.io/version: v0.12.0
6+
controller-gen.kubebuilder.io/version: v0.9.2
7+
creationTimestamp: null
78
name: terraforms.tf.galleybytes.com
89
spec:
910
group: tf.galleybytes.com
@@ -2563,6 +2564,17 @@ spec:
25632564
it, the chance of recycling existing resources is reduced to virtually
25642565
nil.
25652566
type: string
2567+
retryEventReson:
2568+
description: "RetryEventReason copies the value of the resource label
2569+
for 'kubernetes.io/change-cause'. When '.setup' is the suffix
2570+
of the value, the pipeline will retry from the setup task. \n Example
2571+
of starting from setup: \n ```yaml metadata: labels: kubernetes.io/change-cause:
2572+
triggered-by-isa_aguilar-20231025T011600.setup ``` \n A default
2573+
retry will start from the init task otherwise."
2574+
type: string
2575+
retryTimestamp:
2576+
format: date-time
2577+
type: string
25662578
stage:
25672579
description: Stage stores information about the current stage
25682580
properties:

‎pkg/apis/tf/v1beta1/terraform_types.go

+15
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,21 @@ type TerraformStatus struct {
675675
// refreshed each generation.
676676
// +optional
677677
PluginsStarted []TaskName `json:"pluginsStarted,omitempty"`
678+
679+
// RetryEventReason copies the value of the resource label for 'kubernetes.io/change-cause'.
680+
// When '.setup' is the suffix of the value, the pipeline will retry from the setup task.
681+
//
682+
// Example of starting from setup:
683+
//
684+
// ```yaml
685+
// metadata:
686+
// labels:
687+
// kubernetes.io/change-cause: triggered-by-isa_aguilar-20231025T011600.setup
688+
// ```
689+
//
690+
// A default retry will start from the init task otherwise.
691+
RetryEventReason *string `json:"retryEventReson,omitempty"`
692+
RetryTimestamp *metav1.Time `json:"retryTimestamp,omitempty"`
678693
}
679694

680695
type Exported string

‎pkg/apis/tf/v1beta1/zz_generated.deepcopy.go

+9
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎pkg/apis/tf/v1beta1/zz_generated.openapi.go

+13-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎pkg/controllers/terraform_controller.go

+65-13
Original file line numberDiff line numberDiff line change
@@ -670,14 +670,41 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
670670
//
671671
// }
672672
// }
673-
stage := r.checkSetNewStage(ctx, tf)
673+
674+
retry := false
675+
if tf.Labels != nil {
676+
if label, found := tf.Labels["kubernetes.io/change-cause"]; found {
677+
678+
if tf.Status.RetryEventReason == nil {
679+
retry = true
680+
} else if *tf.Status.RetryEventReason != label {
681+
retry = true
682+
}
683+
684+
if retry {
685+
// Once a single retry is triggered via the change-cause label method,
686+
// the retry* status entries will persist for the lifetime of
687+
// the resource. This doesn't affect workflows, but it's a little annoying to see the
688+
// status long after the retry has occurred. In the future, see if there is a way to clean
689+
// up the status.
690+
// As of today, attempting to clean the retry* status when the change-cause label still exists
691+
// causes the controller to skip new generation steps like creating configmaps, secrets, etc.
692+
// TODO clean retry* status
693+
now := metav1.Now()
694+
tf.Status.RetryEventReason = &label // saved via updateStatusWithRetry
695+
tf.Status.RetryTimestamp = &now // saved via updateStatusWithRetry
696+
}
697+
}
698+
}
699+
700+
stage := r.checkSetNewStage(ctx, tf, retry)
674701
if stage != nil {
675-
tf.Status.Stage = *stage
676702
if stage.Reason == "RESTARTED_WORKFLOW" || stage.Reason == "RESTARTED_DELETE_WORKFLOW" {
677-
_ = r.removeOldPlan(tf)
703+
_ = r.removeOldPlan(tf.Namespace, tf.Name, tf.Status.Stage.Reason, tf.Generation)
678704
// TODO what to do if the remove old plan function fails
679705
}
680706
reqLogger.V(2).Info(fmt.Sprintf("Stage moving from '%s' -> '%s'", tf.Status.Stage.TaskType, stage.TaskType))
707+
tf.Status.Stage = *stage
681708
desiredStatus := tf.Status
682709
err := r.updateStatusWithRetry(ctx, tf, &desiredStatus, reqLogger)
683710
if err != nil {
@@ -765,6 +792,16 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
765792
return reconcile.Result{}, nil
766793
}
767794

795+
if tf.Status.RetryTimestamp != nil {
796+
podSlice := []corev1.Pod{}
797+
for _, pod := range pods.Items {
798+
if pod.CreationTimestamp.IsZero() || !pod.CreationTimestamp.Before(tf.Status.RetryTimestamp) {
799+
podSlice = append(podSlice, pod)
800+
}
801+
}
802+
pods.Items = podSlice
803+
}
804+
768805
if len(pods.Items) == 0 && tf.Status.Stage.State == tfv1beta1.StateInProgress {
769806
// This condition is generally met when the user deletes the pod.
770807
// Force the state to transition away from in-progress and then
@@ -851,7 +888,7 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
851888
reqLogger.V(1).Info(fmt.Sprintf("Setting up the '%s' pod", podType))
852889
err := r.setupAndRun(ctx, tf, runOpts)
853890
if err != nil {
854-
reqLogger.Error(err, "")
891+
reqLogger.Error(err, err.Error())
855892
return reconcile.Result{}, err
856893
}
857894
if tf.Status.Phase == tfv1beta1.PhaseInitializing {
@@ -1030,7 +1067,7 @@ func getConfiguredTasks(taskOptions *[]tfv1beta1.TaskOption) []tfv1beta1.TaskNam
10301067
// When a stage has already triggered a pod, the only way for the pod to transition to the next stage is for
10311068
// the pod to complete successfully. Any other pod phase will keep the pod in the current stage, or in the
10321069
// case of the apply task, the workflow will be restarted.
1033-
func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.Terraform) *tfv1beta1.Stage {
1070+
func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.Terraform, isRetry bool) *tfv1beta1.Stage {
10341071
var isNewStage bool
10351072
var podType tfv1beta1.TaskName
10361073
var reason string
@@ -1052,8 +1089,23 @@ func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.
10521089
currentStageIsRunning := currentStage.State == tfv1beta1.StateInProgress
10531090
isNewGeneration := currentStage.Generation != tf.Generation
10541091

1055-
// resource status
1056-
if currentStageCanNotBeInterrupted && currentStageIsRunning {
1092+
if isRetry && !isToBeDeletedOrIsDeleting && !isNewGeneration {
1093+
isNewStage = true
1094+
reason = *tf.Status.RetryEventReason
1095+
podType = tfv1beta1.RunInit
1096+
if strings.HasSuffix(reason, ".setup") {
1097+
podType = tfv1beta1.RunSetup
1098+
}
1099+
interruptible = isTaskInterruptable(podType)
1100+
} else if isRetry && isToBeDeletedOrIsDeleting && !isNewGeneration {
1101+
isNewStage = true
1102+
reason = *tf.Status.RetryEventReason
1103+
podType = tfv1beta1.RunInitDelete
1104+
if strings.HasSuffix(reason, ".setup") {
1105+
podType = tfv1beta1.RunSetupDelete
1106+
}
1107+
interruptible = isTaskInterruptable(podType)
1108+
} else if currentStageCanNotBeInterrupted && currentStageIsRunning {
10571109
// Cannot change to the next stage because the current stage cannot be
10581110
// interrupted and is currently running
10591111
isNewStage = false
@@ -1125,20 +1177,20 @@ func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.
11251177

11261178
}
11271179

1128-
func (r ReconcileTerraform) removeOldPlan(tf *tfv1beta1.Terraform) error {
1180+
func (r ReconcileTerraform) removeOldPlan(namespace, name, reason string, generation int64) error {
11291181
labelSelectors := []string{
1130-
fmt.Sprintf("terraforms.tf.galleybytes.com/generation==%d", tf.Generation),
1131-
fmt.Sprintf("terraforms.tf.galleybytes.com/resourceName=%s", tf.Name),
1182+
fmt.Sprintf("terraforms.tf.galleybytes.com/generation==%d", generation),
1183+
fmt.Sprintf("terraforms.tf.galleybytes.com/resourceName=%s", name),
11321184
"app.kubernetes.io/instance",
11331185
}
1134-
if tf.Status.Stage.Reason == "RESTARTED_WORKFLOW" {
1186+
if reason == "RESTARTED_WORKFLOW" {
11351187
labelSelectors = append(labelSelectors, []string{
11361188
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunSetup),
11371189
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunPreInit),
11381190
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunInit),
11391191
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunPostInit),
11401192
}...)
1141-
} else if tf.Status.Stage.Reason == "RESTARTED_DELETE_WORKFLOW" {
1193+
} else if reason == "RESTARTED_DELETE_WORKFLOW" {
11421194
labelSelectors = append(labelSelectors, []string{
11431195
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunSetupDelete),
11441196
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunPreInitDelete),
@@ -1157,7 +1209,7 @@ func (r ReconcileTerraform) removeOldPlan(tf *tfv1beta1.Terraform) error {
11571209
err = r.Client.DeleteAllOf(context.TODO(), &corev1.Pod{}, &client.DeleteAllOfOptions{
11581210
ListOptions: client.ListOptions{
11591211
LabelSelector: labelSelector,
1160-
Namespace: tf.Namespace,
1212+
Namespace: namespace,
11611213
FieldSelector: fieldSelector,
11621214
},
11631215
})

0 commit comments

Comments
 (0)
Please sign in to comment.