71
71
import java .io .Closeable ;
72
72
import java .io .IOException ;
73
73
import java .util .ArrayList ;
74
+ import java .util .Arrays ;
74
75
import java .util .Collections ;
75
76
import java .util .HashMap ;
76
77
import java .util .List ;
@@ -141,13 +142,49 @@ public class RemoteClusterStateService implements Closeable {
141
142
Setting .Property .NodeScope
142
143
);
143
144
144
- public static final Setting <Boolean > REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING = Setting .boolSetting (
145
- "cluster.remote_store.state.checksum_validation.enabled" ,
146
- false ,
147
- Property .Dynamic ,
148
- Property .NodeScope
145
// Dynamic, node-scoped setting registered under the key
// "cluster.remote_store.state.checksum_validation.mode". The raw string value is parsed with
// RemoteClusterStateValidationMode.parseString and the default is the name of NONE.
+ public static final Setting <RemoteClusterStateValidationMode > REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING = new Setting <>(
146
+ "cluster.remote_store.state.checksum_validation.mode" ,
147
+ RemoteClusterStateValidationMode .NONE .name (),
148
+ RemoteClusterStateValidationMode ::parseString ,
149
+ Setting .Property .Dynamic ,
150
+ Setting .Property .NodeScope
149
151
);
150
152
153
/**
 * Validation mode for the cluster state checksum.
 * <ul>
 * <li>{@link #NONE}: validation is disabled.</li>
 * <li>{@link #DEBUG}: validation is enabled but only matches the checksum and logs the failing entities.</li>
 * <li>{@link #TRACE}: matches the checksum and downloads the full cluster state to find the diff in the
 * failing entities. Only logs failures.</li>
 * <li>{@link #FAILURE}: throws an exception when validation fails.</li>
 * </ul>
 */
public enum RemoteClusterStateValidationMode {
    DEBUG("debug"),
    TRACE("trace"),
    FAILURE("failure"),
    NONE("none");

    /** Lower-case label of this mode, as passed to the constructor. */
    public final String mode;

    RemoteClusterStateValidationMode(String mode) {
        this.mode = mode;
    }

    /**
     * Resolves a mode from its textual form. Matching is case-insensitive and uses {@link Locale#ROOT}
     * so the result does not depend on the default locale.
     *
     * @param mode textual mode, e.g. {@code "debug"} or {@code "FAILURE"}
     * @return the matching constant
     * @throws IllegalArgumentException if {@code mode} is {@code null} or does not name a constant
     */
    public static RemoteClusterStateValidationMode parseString(String mode) {
        if (mode != null) {
            try {
                return RemoteClusterStateValidationMode.valueOf(mode.toUpperCase(Locale.ROOT));
            } catch (IllegalArgumentException ignored) {
                // Deliberately ignored: replaced below by an error that lists the supported modes.
            }
        }
        // Arrays.toString already wraps the values in brackets, so none are added here.
        throw new IllegalArgumentException(
            "[" + mode + "] mode is not supported. supported modes are " + Arrays.toString(RemoteClusterStateValidationMode.values())
        );
    }
}
187
+
151
188
private TimeValue remoteStateReadTimeout ;
152
189
private final String nodeId ;
153
190
private final Supplier <RepositoriesService > repositoriesService ;
@@ -159,7 +196,7 @@ public class RemoteClusterStateService implements Closeable {
159
196
private BlobStoreTransferService blobStoreTransferService ;
160
197
private RemoteRoutingTableService remoteRoutingTableService ;
161
198
private volatile TimeValue slowWriteLoggingThreshold ;
162
- private boolean checksumValidationEnabled ;
199
+ private RemoteClusterStateValidationMode remoteClusterStateValidationMode ;
163
200
164
201
private final RemotePersistenceStats remoteStateStats ;
165
202
private RemoteClusterStateCleanupManager remoteClusterStateCleanupManager ;
@@ -206,11 +243,8 @@ public RemoteClusterStateService(
206
243
clusterSettings .addSettingsUpdateConsumer (SLOW_WRITE_LOGGING_THRESHOLD , this ::setSlowWriteLoggingThreshold );
207
244
this .remoteStateReadTimeout = clusterSettings .get (REMOTE_STATE_READ_TIMEOUT_SETTING );
208
245
clusterSettings .addSettingsUpdateConsumer (REMOTE_STATE_READ_TIMEOUT_SETTING , this ::setRemoteStateReadTimeout );
209
- this .checksumValidationEnabled = clusterSettings .get (REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING );
210
- clusterSettings .addSettingsUpdateConsumer (
211
- REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING ,
212
- this ::setChecksumValidationEnabled
213
- );
246
+ this .remoteClusterStateValidationMode = REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING .get (settings );
247
+ clusterSettings .addSettingsUpdateConsumer (REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING , this ::setChecksumValidationMode );
214
248
215
249
this .remoteStateStats = new RemotePersistenceStats ();
216
250
this .namedWriteableRegistry = namedWriteableRegistry ;
@@ -272,7 +306,7 @@ public RemoteClusterStateManifestInfo writeFullMetadata(ClusterState clusterStat
272
306
uploadedMetadataResults ,
273
307
previousClusterUUID ,
274
308
clusterStateDiffManifest ,
275
- checksumValidationEnabled ? new ClusterStateChecksum (clusterState ) : null ,
309
+ ! remoteClusterStateValidationMode . equals ( RemoteClusterStateValidationMode . NONE ) ? new ClusterStateChecksum (clusterState ) : null ,
276
310
false ,
277
311
codecVersion
278
312
);
@@ -472,7 +506,7 @@ public RemoteClusterStateManifestInfo writeIncrementalMetadata(
472
506
uploadedMetadataResults ,
473
507
previousManifest .getPreviousClusterUUID (),
474
508
clusterStateDiffManifest ,
475
- checksumValidationEnabled ? new ClusterStateChecksum (clusterState ) : null ,
509
+ ! remoteClusterStateValidationMode . equals ( RemoteClusterStateValidationMode . NONE ) ? new ClusterStateChecksum (clusterState ) : null ,
476
510
false ,
477
511
previousManifest .getCodecVersion ()
478
512
);
@@ -917,7 +951,7 @@ public RemoteClusterStateManifestInfo markLastStateAsCommitted(ClusterState clus
917
951
uploadedMetadataResults ,
918
952
previousManifest .getPreviousClusterUUID (),
919
953
previousManifest .getDiffManifest (),
920
- checksumValidationEnabled ? previousManifest . getClusterStateChecksum ( ) : null ,
954
+ ! remoteClusterStateValidationMode . equals ( RemoteClusterStateValidationMode . NONE ) ? new ClusterStateChecksum ( clusterState ) : null ,
921
955
true ,
922
956
previousManifest .getCodecVersion ()
923
957
);
@@ -1003,8 +1037,8 @@ private void setSlowWriteLoggingThreshold(TimeValue slowWriteLoggingThreshold) {
1003
1037
this .slowWriteLoggingThreshold = slowWriteLoggingThreshold ;
1004
1038
}
1005
1039
1006
- private void setChecksumValidationEnabled ( Boolean checksumValidationEnabled ) {
1007
- this .checksumValidationEnabled = checksumValidationEnabled ;
1040
// Settings-update consumer registered for REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING:
// stores the newly applied validation mode on this instance.
+ private void setChecksumValidationMode ( RemoteClusterStateValidationMode remoteClusterStateValidationMode ) {
1041
+ this .remoteClusterStateValidationMode = remoteClusterStateValidationMode ;
1008
1042
}
1009
1043
1010
1044
// Package private for unit test
@@ -1376,7 +1410,9 @@ public ClusterState getClusterStateForManifest(
1376
1410
includeEphemeral
1377
1411
);
1378
1412
1379
- if (includeEphemeral && checksumValidationEnabled && manifest .getClusterStateChecksum () != null ) {
1413
+ if (includeEphemeral
1414
+ && !remoteClusterStateValidationMode .equals (RemoteClusterStateValidationMode .NONE )
1415
+ && manifest .getClusterStateChecksum () != null ) {
1380
1416
validateClusterStateFromChecksum (manifest , clusterState , clusterName , localNodeId , true );
1381
1417
}
1382
1418
} else {
@@ -1498,7 +1534,7 @@ public ClusterState getClusterStateUsingDiff(ClusterMetadataManifest manifest, C
1498
1534
.routingTable (new RoutingTable (manifest .getRoutingTableVersion (), indexRoutingTables ))
1499
1535
.build ();
1500
1536
1501
- if (checksumValidationEnabled && manifest .getClusterStateChecksum () != null ) {
1537
+ if (! remoteClusterStateValidationMode . equals ( RemoteClusterStateValidationMode . NONE ) && manifest .getClusterStateChecksum () != null ) {
1502
1538
validateClusterStateFromChecksum (manifest , clusterState , previousState .getClusterName ().value (), localNodeId , false );
1503
1539
}
1504
1540
final long durationMillis = TimeValue .nsecToMSec (relativeTimeNanosSupplier .getAsLong () - startTimeNanos );
@@ -1517,20 +1553,24 @@ void validateClusterStateFromChecksum(
1517
1553
) {
1518
1554
ClusterStateChecksum newClusterStateChecksum = new ClusterStateChecksum (clusterState );
1519
1555
List <String > failedValidation = newClusterStateChecksum .getMismatchEntities (manifest .getClusterStateChecksum ());
1520
- if (!failedValidation .isEmpty ()) {
1521
- logger .error (
1522
- () -> new ParameterizedMessage (
1523
- "Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}" ,
1524
- manifest .getClusterStateChecksum (),
1525
- newClusterStateChecksum ,
1526
- failedValidation
1527
- )
1556
+ if (failedValidation .isEmpty ()) {
1557
+ return ;
1558
+ }
1559
+ logger .error (
1560
+ () -> new ParameterizedMessage (
1561
+ "Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}" ,
1562
+ manifest .getClusterStateChecksum (),
1563
+ newClusterStateChecksum ,
1564
+ failedValidation
1565
+ )
1566
+ );
1567
+ if (isFullStateDownload && remoteClusterStateValidationMode .equals (RemoteClusterStateValidationMode .FAILURE )) {
1568
+ throw new IllegalStateException (
1569
+ "Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
1528
1570
);
1529
- if (isFullStateDownload ) {
1530
- throw new IllegalStateException (
1531
- "Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
1532
- );
1533
- }
1571
+ }
1572
+ if (remoteClusterStateValidationMode .equals (RemoteClusterStateValidationMode .FAILURE )
1573
+ || remoteClusterStateValidationMode .equals (RemoteClusterStateValidationMode .TRACE )) {
1534
1574
// download full cluster state and match against state created for the failing entities
1535
1575
ClusterState fullClusterState = readClusterStateInParallel (
1536
1576
ClusterState .builder (new ClusterName (clusterName )).build (),
@@ -1663,6 +1703,8 @@ void validateClusterStateFromChecksum(
1663
1703
break ;
1664
1704
}
1665
1705
}
1706
+ }
1707
+ if (remoteClusterStateValidationMode .equals (RemoteClusterStateValidationMode .FAILURE )) {
1666
1708
throw new IllegalStateException (
1667
1709
"Cluster state checksums do not match during diff read. Validation failed for " + failedValidation
1668
1710
);
0 commit comments