Skip to content

Commit 153f978

Browse files
Changing checksum setting to support modes (#15622) (#15672)
* Changing checksum setting to support modes Signed-off-by: Himshikha Gupta <himshikh@amazon.com>
1 parent d2ae53e commit 153f978

File tree

5 files changed

+406
-43
lines changed

5 files changed

+406
-43
lines changed

server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteRoutingTableServiceIT.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
6767
)
6868
.put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, REMOTE_ROUTING_TABLE_REPO)
6969
.put(REMOTE_PUBLICATION_EXPERIMENTAL, true)
70-
.put(RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING.getKey(), true)
70+
.put(
71+
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.getKey(),
72+
RemoteClusterStateService.RemoteClusterStateValidationMode.FAILURE
73+
)
7174
.build();
7275
}
7376

server/src/internalClusterTest/java/org/opensearch/gateway/remote/RemoteStatePublicationIT.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
9090
.put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, routingTableRepoName)
9191
.put(routingTableRepoTypeAttributeKey, ReloadableFsRepository.TYPE)
9292
.put(routingTableRepoSettingsAttributeKeyPrefix + "location", segmentRepoPath)
93-
.put(RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING.getKey(), true)
93+
.put(
94+
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.getKey(),
95+
RemoteClusterStateService.RemoteClusterStateValidationMode.FAILURE
96+
)
9497
.build();
9598
}
9699

server/src/main/java/org/opensearch/common/settings/ClusterSettings.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,7 @@ public void apply(Settings value, Settings current, Settings previous) {
743743
IndicesService.CLUSTER_INDEX_RESTRICT_REPLICATION_TYPE_SETTING,
744744
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_TYPE_SETTING,
745745
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_HASH_ALGO_SETTING,
746-
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING,
746+
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING,
747747

748748
AdmissionControlSettings.ADMISSION_CONTROL_TRANSPORT_LAYER_MODE,
749749
CpuBasedAdmissionControllerSettings.CPU_BASED_ADMISSION_CONTROLLER_TRANSPORT_LAYER_MODE,

server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java

+73-31
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
import java.io.Closeable;
7272
import java.io.IOException;
7373
import java.util.ArrayList;
74+
import java.util.Arrays;
7475
import java.util.Collections;
7576
import java.util.HashMap;
7677
import java.util.List;
@@ -141,13 +142,49 @@ public class RemoteClusterStateService implements Closeable {
141142
Setting.Property.NodeScope
142143
);
143144

144-
public static final Setting<Boolean> REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING = Setting.boolSetting(
145-
"cluster.remote_store.state.checksum_validation.enabled",
146-
false,
147-
Property.Dynamic,
148-
Property.NodeScope
145+
public static final Setting<RemoteClusterStateValidationMode> REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING = new Setting<>(
146+
"cluster.remote_store.state.checksum_validation.mode",
147+
RemoteClusterStateValidationMode.NONE.name(),
148+
RemoteClusterStateValidationMode::parseString,
149+
Setting.Property.Dynamic,
150+
Setting.Property.NodeScope
149151
);
150152

153+
/**
 * Validation mode for cluster state checksum.
 * <ul>
 * <li>{@code NONE}: Validation will be disabled.</li>
 * <li>{@code DEBUG}: Validation enabled but only matches checksum and logs failing entities.</li>
 * <li>{@code TRACE}: Matches checksum and downloads full cluster state to find diff in failing entities.
 * Only logs failures.</li>
 * <li>{@code FAILURE}: Throws exception on failing validation.</li>
 * </ul>
 */
public enum RemoteClusterStateValidationMode {
    DEBUG("debug"),
    TRACE("trace"),
    FAILURE("failure"),
    NONE("none");

    /** Lower-case label supplied for this mode at construction. */
    public final String mode;

    RemoteClusterStateValidationMode(String mode) {
        this.mode = mode;
    }

    /**
     * Resolves a mode from its name, ignoring case (upper-cased with {@link Locale#ROOT}).
     *
     * @param mode name of the mode, e.g. {@code "debug"} or {@code "FAILURE"}
     * @return the matching validation mode
     * @throws IllegalArgumentException if {@code mode} is {@code null} or does not name a supported mode
     */
    public static RemoteClusterStateValidationMode parseString(String mode) {
        if (mode == null) {
            // Report null like any other unsupported value instead of leaking a NullPointerException.
            throw unsupportedMode(null, null);
        }
        try {
            return RemoteClusterStateValidationMode.valueOf(mode.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            // Keep the original failure as the cause so the stack trace is not lost.
            throw unsupportedMode(mode, e);
        }
    }

    /** Builds the exception raised for a value that does not map to any mode. */
    private static IllegalArgumentException unsupportedMode(String mode, Throwable cause) {
        // Arrays.toString already wraps the values in square brackets; do not add another pair.
        return new IllegalArgumentException(
            "[" + mode + "] mode is not supported. supported modes are " + Arrays.toString(RemoteClusterStateValidationMode.values()),
            cause
        );
    }
}
187+
151188
private TimeValue remoteStateReadTimeout;
152189
private final String nodeId;
153190
private final Supplier<RepositoriesService> repositoriesService;
@@ -159,7 +196,7 @@ public class RemoteClusterStateService implements Closeable {
159196
private BlobStoreTransferService blobStoreTransferService;
160197
private RemoteRoutingTableService remoteRoutingTableService;
161198
private volatile TimeValue slowWriteLoggingThreshold;
162-
private boolean checksumValidationEnabled;
199+
private RemoteClusterStateValidationMode remoteClusterStateValidationMode;
163200

164201
private final RemotePersistenceStats remoteStateStats;
165202
private RemoteClusterStateCleanupManager remoteClusterStateCleanupManager;
@@ -206,11 +243,8 @@ public RemoteClusterStateService(
206243
clusterSettings.addSettingsUpdateConsumer(SLOW_WRITE_LOGGING_THRESHOLD, this::setSlowWriteLoggingThreshold);
207244
this.remoteStateReadTimeout = clusterSettings.get(REMOTE_STATE_READ_TIMEOUT_SETTING);
208245
clusterSettings.addSettingsUpdateConsumer(REMOTE_STATE_READ_TIMEOUT_SETTING, this::setRemoteStateReadTimeout);
209-
this.checksumValidationEnabled = clusterSettings.get(REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING);
210-
clusterSettings.addSettingsUpdateConsumer(
211-
REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING,
212-
this::setChecksumValidationEnabled
213-
);
246+
this.remoteClusterStateValidationMode = REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.get(settings);
247+
clusterSettings.addSettingsUpdateConsumer(REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING, this::setChecksumValidationMode);
214248

215249
this.remoteStateStats = new RemotePersistenceStats();
216250
this.namedWriteableRegistry = namedWriteableRegistry;
@@ -272,7 +306,7 @@ public RemoteClusterStateManifestInfo writeFullMetadata(ClusterState clusterStat
272306
uploadedMetadataResults,
273307
previousClusterUUID,
274308
clusterStateDiffManifest,
275-
checksumValidationEnabled ? new ClusterStateChecksum(clusterState) : null,
309+
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
276310
false,
277311
codecVersion
278312
);
@@ -472,7 +506,7 @@ public RemoteClusterStateManifestInfo writeIncrementalMetadata(
472506
uploadedMetadataResults,
473507
previousManifest.getPreviousClusterUUID(),
474508
clusterStateDiffManifest,
475-
checksumValidationEnabled ? new ClusterStateChecksum(clusterState) : null,
509+
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
476510
false,
477511
previousManifest.getCodecVersion()
478512
);
@@ -917,7 +951,7 @@ public RemoteClusterStateManifestInfo markLastStateAsCommitted(ClusterState clus
917951
uploadedMetadataResults,
918952
previousManifest.getPreviousClusterUUID(),
919953
previousManifest.getDiffManifest(),
920-
checksumValidationEnabled ? previousManifest.getClusterStateChecksum() : null,
954+
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
921955
true,
922956
previousManifest.getCodecVersion()
923957
);
@@ -1003,8 +1037,8 @@ private void setSlowWriteLoggingThreshold(TimeValue slowWriteLoggingThreshold) {
10031037
this.slowWriteLoggingThreshold = slowWriteLoggingThreshold;
10041038
}
10051039

1006-
private void setChecksumValidationEnabled(Boolean checksumValidationEnabled) {
1007-
this.checksumValidationEnabled = checksumValidationEnabled;
1040+
private void setChecksumValidationMode(RemoteClusterStateValidationMode remoteClusterStateValidationMode) {
1041+
this.remoteClusterStateValidationMode = remoteClusterStateValidationMode;
10081042
}
10091043

10101044
// Package private for unit test
@@ -1376,7 +1410,9 @@ public ClusterState getClusterStateForManifest(
13761410
includeEphemeral
13771411
);
13781412

1379-
if (includeEphemeral && checksumValidationEnabled && manifest.getClusterStateChecksum() != null) {
1413+
if (includeEphemeral
1414+
&& !remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE)
1415+
&& manifest.getClusterStateChecksum() != null) {
13801416
validateClusterStateFromChecksum(manifest, clusterState, clusterName, localNodeId, true);
13811417
}
13821418
} else {
@@ -1498,7 +1534,7 @@ public ClusterState getClusterStateUsingDiff(ClusterMetadataManifest manifest, C
14981534
.routingTable(new RoutingTable(manifest.getRoutingTableVersion(), indexRoutingTables))
14991535
.build();
15001536

1501-
if (checksumValidationEnabled && manifest.getClusterStateChecksum() != null) {
1537+
if (!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) && manifest.getClusterStateChecksum() != null) {
15021538
validateClusterStateFromChecksum(manifest, clusterState, previousState.getClusterName().value(), localNodeId, false);
15031539
}
15041540
final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos);
@@ -1517,20 +1553,24 @@ void validateClusterStateFromChecksum(
15171553
) {
15181554
ClusterStateChecksum newClusterStateChecksum = new ClusterStateChecksum(clusterState);
15191555
List<String> failedValidation = newClusterStateChecksum.getMismatchEntities(manifest.getClusterStateChecksum());
1520-
if (!failedValidation.isEmpty()) {
1521-
logger.error(
1522-
() -> new ParameterizedMessage(
1523-
"Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}",
1524-
manifest.getClusterStateChecksum(),
1525-
newClusterStateChecksum,
1526-
failedValidation
1527-
)
1556+
if (failedValidation.isEmpty()) {
1557+
return;
1558+
}
1559+
logger.error(
1560+
() -> new ParameterizedMessage(
1561+
"Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}",
1562+
manifest.getClusterStateChecksum(),
1563+
newClusterStateChecksum,
1564+
failedValidation
1565+
)
1566+
);
1567+
if (isFullStateDownload && remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
1568+
throw new IllegalStateException(
1569+
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
15281570
);
1529-
if (isFullStateDownload) {
1530-
throw new IllegalStateException(
1531-
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
1532-
);
1533-
}
1571+
}
1572+
if (remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)
1573+
|| remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.TRACE)) {
15341574
// download full cluster state and match against state created for the failing entities
15351575
ClusterState fullClusterState = readClusterStateInParallel(
15361576
ClusterState.builder(new ClusterName(clusterName)).build(),
@@ -1663,6 +1703,8 @@ void validateClusterStateFromChecksum(
16631703
break;
16641704
}
16651705
}
1706+
}
1707+
if (remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
16661708
throw new IllegalStateException(
16671709
"Cluster state checksums do not match during diff read. Validation failed for " + failedValidation
16681710
);

0 commit comments

Comments
 (0)