@@ -670,14 +670,41 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
670
670
//
671
671
// }
672
672
// }
673
- stage := r .checkSetNewStage (ctx , tf )
673
+
674
+ retry := false
675
+ if tf .Labels != nil {
676
+ if label , found := tf .Labels ["kubernetes.io/change-cause" ]; found {
677
+
678
+ if tf .Status .RetryEventReason == nil {
679
+ retry = true
680
+ } else if * tf .Status .RetryEventReason != label {
681
+ retry = true
682
+ }
683
+
684
+ if retry {
685
+ // Once a single retry is triggered via the change-cause label method,
686
+ // the retry* status entries will persist for the lifetime of
687
+ // the resource. This doesn't affect workflows, but it's a little annoying to see the
688
+ // status long after the retry has occurred. In the future, see if there is a way to clean
689
+ // up the status.
690
+ // As of today, attempting to clean the retry* status when the change-cause label still exists
691
+ // causes the controller to skip new generation steps like creating configmaps, secrets, etc.
692
+ // TODO clean retry* status
693
+ now := metav1 .Now ()
694
+ tf .Status .RetryEventReason = & label // saved via updateStatusWithRetry
695
+ tf .Status .RetryTimestamp = & now // saved via updateStatusWithRetry
696
+ }
697
+ }
698
+ }
699
+
700
+ stage := r .checkSetNewStage (ctx , tf , retry )
674
701
if stage != nil {
675
- tf .Status .Stage = * stage
676
702
if stage .Reason == "RESTARTED_WORKFLOW" || stage .Reason == "RESTARTED_DELETE_WORKFLOW" {
677
- _ = r .removeOldPlan (tf )
703
+ _ = r .removeOldPlan (tf . Namespace , tf . Name , tf . Status . Stage . Reason , tf . Generation )
678
704
// TODO: decide how to handle an error returned by removeOldPlan; it is currently discarded.
679
705
}
680
706
reqLogger .V (2 ).Info (fmt .Sprintf ("Stage moving from '%s' -> '%s'" , tf .Status .Stage .TaskType , stage .TaskType ))
707
+ tf .Status .Stage = * stage
681
708
desiredStatus := tf .Status
682
709
err := r .updateStatusWithRetry (ctx , tf , & desiredStatus , reqLogger )
683
710
if err != nil {
@@ -765,6 +792,16 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
765
792
return reconcile.Result {}, nil
766
793
}
767
794
795
+ if tf .Status .RetryTimestamp != nil {
796
+ podSlice := []corev1.Pod {}
797
+ for _ , pod := range pods .Items {
798
+ if pod .CreationTimestamp .IsZero () || ! pod .CreationTimestamp .Before (tf .Status .RetryTimestamp ) {
799
+ podSlice = append (podSlice , pod )
800
+ }
801
+ }
802
+ pods .Items = podSlice
803
+ }
804
+
768
805
if len (pods .Items ) == 0 && tf .Status .Stage .State == tfv1beta1 .StateInProgress {
769
806
// This condition is generally met when the user deletes the pod.
770
807
// Force the state to transition away from in-progress and then
@@ -851,7 +888,7 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
851
888
reqLogger .V (1 ).Info (fmt .Sprintf ("Setting up the '%s' pod" , podType ))
852
889
err := r .setupAndRun (ctx , tf , runOpts )
853
890
if err != nil {
854
- reqLogger .Error (err , "" )
891
+ reqLogger .Error (err , err . Error () )
855
892
return reconcile.Result {}, err
856
893
}
857
894
if tf .Status .Phase == tfv1beta1 .PhaseInitializing {
@@ -1030,7 +1067,7 @@ func getConfiguredTasks(taskOptions *[]tfv1beta1.TaskOption) []tfv1beta1.TaskNam
1030
1067
// When a stage has already triggered a pod, the only way for the pod to transition to the next stage is for
1031
1068
// the pod to complete successfully. Any other pod phase will keep the pod in the current stage, or in the
1032
1069
// case of the apply task, the workflow will be restarted.
1033
- func (r ReconcileTerraform ) checkSetNewStage (ctx context.Context , tf * tfv1beta1.Terraform ) * tfv1beta1.Stage {
1070
+ func (r ReconcileTerraform ) checkSetNewStage (ctx context.Context , tf * tfv1beta1.Terraform , isRetry bool ) * tfv1beta1.Stage {
1034
1071
var isNewStage bool
1035
1072
var podType tfv1beta1.TaskName
1036
1073
var reason string
@@ -1052,8 +1089,23 @@ func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.
1052
1089
currentStageIsRunning := currentStage .State == tfv1beta1 .StateInProgress
1053
1090
isNewGeneration := currentStage .Generation != tf .Generation
1054
1091
1055
- // resource status
1056
- if currentStageCanNotBeInterrupted && currentStageIsRunning {
1092
+ if isRetry && ! isToBeDeletedOrIsDeleting && ! isNewGeneration {
1093
+ isNewStage = true
1094
+ reason = * tf .Status .RetryEventReason
1095
+ podType = tfv1beta1 .RunInit
1096
+ if strings .HasSuffix (reason , ".setup" ) {
1097
+ podType = tfv1beta1 .RunSetup
1098
+ }
1099
+ interruptible = isTaskInterruptable (podType )
1100
+ } else if isRetry && isToBeDeletedOrIsDeleting && ! isNewGeneration {
1101
+ isNewStage = true
1102
+ reason = * tf .Status .RetryEventReason
1103
+ podType = tfv1beta1 .RunInitDelete
1104
+ if strings .HasSuffix (reason , ".setup" ) {
1105
+ podType = tfv1beta1 .RunSetupDelete
1106
+ }
1107
+ interruptible = isTaskInterruptable (podType )
1108
+ } else if currentStageCanNotBeInterrupted && currentStageIsRunning {
1057
1109
// Cannot change to the next stage because the current stage cannot be
1058
1110
// interrupted and is currently running
1059
1111
isNewStage = false
@@ -1125,20 +1177,20 @@ func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.
1125
1177
1126
1178
}
1127
1179
1128
- func (r ReconcileTerraform ) removeOldPlan (tf * tfv1beta1. Terraform ) error {
1180
+ func (r ReconcileTerraform ) removeOldPlan (namespace , name , reason string , generation int64 ) error {
1129
1181
labelSelectors := []string {
1130
- fmt .Sprintf ("terraforms.tf.galleybytes.com/generation==%d" , tf . Generation ),
1131
- fmt .Sprintf ("terraforms.tf.galleybytes.com/resourceName=%s" , tf . Name ),
1182
+ fmt .Sprintf ("terraforms.tf.galleybytes.com/generation==%d" , generation ),
1183
+ fmt .Sprintf ("terraforms.tf.galleybytes.com/resourceName=%s" , name ),
1132
1184
"app.kubernetes.io/instance" ,
1133
1185
}
1134
- if tf . Status . Stage . Reason == "RESTARTED_WORKFLOW" {
1186
+ if reason == "RESTARTED_WORKFLOW" {
1135
1187
labelSelectors = append (labelSelectors , []string {
1136
1188
fmt .Sprintf ("app.kubernetes.io/instance!=%s" , tfv1beta1 .RunSetup ),
1137
1189
fmt .Sprintf ("app.kubernetes.io/instance!=%s" , tfv1beta1 .RunPreInit ),
1138
1190
fmt .Sprintf ("app.kubernetes.io/instance!=%s" , tfv1beta1 .RunInit ),
1139
1191
fmt .Sprintf ("app.kubernetes.io/instance!=%s" , tfv1beta1 .RunPostInit ),
1140
1192
}... )
1141
- } else if tf . Status . Stage . Reason == "RESTARTED_DELETE_WORKFLOW" {
1193
+ } else if reason == "RESTARTED_DELETE_WORKFLOW" {
1142
1194
labelSelectors = append (labelSelectors , []string {
1143
1195
fmt .Sprintf ("app.kubernetes.io/instance!=%s" , tfv1beta1 .RunSetupDelete ),
1144
1196
fmt .Sprintf ("app.kubernetes.io/instance!=%s" , tfv1beta1 .RunPreInitDelete ),
@@ -1157,7 +1209,7 @@ func (r ReconcileTerraform) removeOldPlan(tf *tfv1beta1.Terraform) error {
1157
1209
err = r .Client .DeleteAllOf (context .TODO (), & corev1.Pod {}, & client.DeleteAllOfOptions {
1158
1210
ListOptions : client.ListOptions {
1159
1211
LabelSelector : labelSelector ,
1160
- Namespace : tf . Namespace ,
1212
+ Namespace : namespace ,
1161
1213
FieldSelector : fieldSelector ,
1162
1214
},
1163
1215
})
0 commit comments