|
33 | 33 | package org.opensearch.discovery;
|
34 | 34 |
|
35 | 35 | import org.opensearch.cluster.ClusterState;
|
| 36 | +import org.opensearch.cluster.coordination.FailedToCommitClusterStateException; |
36 | 37 | import org.opensearch.cluster.coordination.JoinHelper;
|
| 38 | +import org.opensearch.cluster.coordination.PersistedStateRegistry; |
37 | 39 | import org.opensearch.cluster.coordination.PublicationTransportHandler;
|
| 40 | +import org.opensearch.cluster.metadata.RepositoriesMetadata; |
| 41 | +import org.opensearch.cluster.metadata.RepositoryMetadata; |
38 | 42 | import org.opensearch.cluster.node.DiscoveryNode;
|
39 | 43 | import org.opensearch.cluster.node.DiscoveryNodes;
|
40 | 44 | import org.opensearch.cluster.service.ClusterService;
|
| 45 | +import org.opensearch.common.Randomness; |
41 | 46 | import org.opensearch.common.settings.Settings;
|
| 47 | +import org.opensearch.repositories.RepositoriesService; |
| 48 | +import org.opensearch.repositories.Repository; |
| 49 | +import org.opensearch.repositories.RepositoryMissingException; |
| 50 | +import org.opensearch.repositories.fs.ReloadableFsRepository; |
42 | 51 | import org.opensearch.test.OpenSearchIntegTestCase;
|
43 | 52 | import org.opensearch.test.disruption.NetworkDisruption;
|
44 | 53 | import org.opensearch.test.disruption.ServiceDisruptionScheme;
|
45 | 54 | import org.opensearch.test.disruption.SlowClusterStateProcessing;
|
46 | 55 | import org.opensearch.test.transport.MockTransportService;
|
47 | 56 | import org.opensearch.transport.Transport;
|
48 | 57 | import org.opensearch.transport.TransportService;
|
| 58 | +import org.junit.Assert; |
49 | 59 |
|
| 60 | +import java.util.Arrays; |
50 | 61 | import java.util.HashSet;
|
| 62 | +import java.util.List; |
| 63 | +import java.util.Objects; |
51 | 64 | import java.util.Set;
|
52 | 65 | import java.util.concurrent.CountDownLatch;
|
| 66 | +import java.util.stream.Collectors; |
53 | 67 |
|
54 | 68 | import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING;
|
55 | 69 | import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING;
|
@@ -250,4 +264,142 @@ public void testNodeNotReachableFromClusterManager() throws Exception {
|
250 | 264 | ensureStableCluster(3);
|
251 | 265 | }
|
252 | 266 |
|
| 267 | + /** |
| 268 | + * Tests the scenario where-in a cluster-state containing new repository meta-data as part of a node-join from a |
| 269 | + * repository-configured node fails on a commit stag and has a master switch. This would lead to master nodes |
| 270 | + * doing another round of node-joins with the new cluster-state as the previous attempt had a successful publish. |
| 271 | + */ |
| 272 | + public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitFails() throws Exception { |
| 273 | + final String remoteStateRepoName = "remote-state-repo"; |
| 274 | + final String remoteRoutingTableRepoName = "routing-table-repo"; |
| 275 | + |
| 276 | + Settings remotePublicationSettings = buildRemotePublicationNodeAttributes( |
| 277 | + remoteStateRepoName, |
| 278 | + ReloadableFsRepository.TYPE, |
| 279 | + remoteRoutingTableRepoName, |
| 280 | + ReloadableFsRepository.TYPE |
| 281 | + ); |
| 282 | + internalCluster().startClusterManagerOnlyNodes(3); |
| 283 | + internalCluster().startDataOnlyNodes(3); |
| 284 | + |
| 285 | + String clusterManagerNode = internalCluster().getClusterManagerName(); |
| 286 | + List<String> nonClusterManagerNodes = Arrays.stream(internalCluster().getNodeNames()) |
| 287 | + .filter(node -> !node.equals(clusterManagerNode)) |
| 288 | + .collect(Collectors.toList()); |
| 289 | + |
| 290 | + ensureStableCluster(6); |
| 291 | + |
| 292 | + MockTransportService clusterManagerTransportService = (MockTransportService) internalCluster().getInstance( |
| 293 | + TransportService.class, |
| 294 | + clusterManagerNode |
| 295 | + ); |
| 296 | + logger.info("Blocking Cluster Manager Commit Request on all nodes"); |
| 297 | + // This is to allow the new node to have commit failures on the nodes in the send path itself. This will lead to the |
| 298 | + // nodes have a successful publish operation but failed commit operation. This will come into play once the new node joins |
| 299 | + nonClusterManagerNodes.forEach(node -> { |
| 300 | + TransportService targetTransportService = internalCluster().getInstance(TransportService.class, node); |
| 301 | + clusterManagerTransportService.addSendBehavior(targetTransportService, (connection, requestId, action, request, options) -> { |
| 302 | + if (action.equals(PublicationTransportHandler.COMMIT_STATE_ACTION_NAME)) { |
| 303 | + logger.info("--> preventing {} request", PublicationTransportHandler.COMMIT_STATE_ACTION_NAME); |
| 304 | + throw new FailedToCommitClusterStateException("Blocking Commit"); |
| 305 | + } |
| 306 | + connection.sendRequest(requestId, action, request, options); |
| 307 | + }); |
| 308 | + }); |
| 309 | + |
| 310 | + logger.info("Starting Node with remote publication settings"); |
| 311 | + // Start a node with remote-publication repositories configured. This will lead to the active cluster-manager create |
| 312 | + // a new cluster-state event with the new node-join along with new repositories setup in the cluster meta-data. |
| 313 | + internalCluster().startDataOnlyNodes(1, remotePublicationSettings, Boolean.TRUE); |
| 314 | + |
| 315 | + // Checking if publish succeeded in the nodes before shutting down the blocked cluster-manager |
| 316 | + assertBusy(() -> { |
| 317 | + String randomNode = nonClusterManagerNodes.get(Randomness.get().nextInt(nonClusterManagerNodes.size())); |
| 318 | + PersistedStateRegistry registry = internalCluster().getInstance(PersistedStateRegistry.class, randomNode); |
| 319 | + |
| 320 | + ClusterState state = registry.getPersistedState(PersistedStateRegistry.PersistedStateType.LOCAL).getLastAcceptedState(); |
| 321 | + RepositoriesMetadata repositoriesMetadata = state.metadata().custom(RepositoriesMetadata.TYPE); |
| 322 | + Boolean isRemoteStateRepoConfigured = Boolean.FALSE; |
| 323 | + Boolean isRemoteRoutingTableRepoConfigured = Boolean.FALSE; |
| 324 | + |
| 325 | + assertNotNull(repositoriesMetadata); |
| 326 | + assertNotNull(repositoriesMetadata.repositories()); |
| 327 | + |
| 328 | + for (RepositoryMetadata repo : repositoriesMetadata.repositories()) { |
| 329 | + if (repo.name().equals(remoteStateRepoName)) { |
| 330 | + isRemoteStateRepoConfigured = Boolean.TRUE; |
| 331 | + } else if (repo.name().equals(remoteRoutingTableRepoName)) { |
| 332 | + isRemoteRoutingTableRepoConfigured = Boolean.TRUE; |
| 333 | + } |
| 334 | + } |
| 335 | + // Asserting that the metadata is present in the persisted cluster-state |
| 336 | + assertTrue(isRemoteStateRepoConfigured); |
| 337 | + assertTrue(isRemoteRoutingTableRepoConfigured); |
| 338 | + |
| 339 | + RepositoriesService repositoriesService = internalCluster().getInstance(RepositoriesService.class, randomNode); |
| 340 | + |
| 341 | + isRemoteStateRepoConfigured = isRepoPresentInRepositoryService(repositoriesService, remoteStateRepoName); |
| 342 | + isRemoteRoutingTableRepoConfigured = isRepoPresentInRepositoryService(repositoriesService, remoteRoutingTableRepoName); |
| 343 | + |
| 344 | + // Asserting that the metadata is not present in the repository service. |
| 345 | + Assert.assertFalse(isRemoteStateRepoConfigured); |
| 346 | + Assert.assertFalse(isRemoteRoutingTableRepoConfigured); |
| 347 | + }); |
| 348 | + |
| 349 | + logger.info("Stopping current Cluster Manager"); |
| 350 | + // We stop the current cluster-manager whose outbound paths were blocked. This is to force a new election onto nodes |
| 351 | + // we had the new cluster-state published but not commited. |
| 352 | + internalCluster().stopCurrentClusterManagerNode(); |
| 353 | + |
| 354 | + // We expect that the repositories validations are skipped in this case and node-joins succeeds as expected. The |
| 355 | + // repositories validations are skipped because even though the cluster-state is updated in the persisted registry, |
| 356 | + // the repository service will not be updated as the commit attempt failed. |
| 357 | + ensureStableCluster(6); |
| 358 | + |
| 359 | + String randomNode = nonClusterManagerNodes.get(Randomness.get().nextInt(nonClusterManagerNodes.size())); |
| 360 | + |
| 361 | + // Checking if the final cluster-state is updated. |
| 362 | + RepositoriesMetadata repositoriesMetadata = internalCluster().getInstance(ClusterService.class, randomNode) |
| 363 | + .state() |
| 364 | + .metadata() |
| 365 | + .custom(RepositoriesMetadata.TYPE); |
| 366 | + |
| 367 | + Boolean isRemoteStateRepoConfigured = Boolean.FALSE; |
| 368 | + Boolean isRemoteRoutingTableRepoConfigured = Boolean.FALSE; |
| 369 | + |
| 370 | + for (RepositoryMetadata repo : repositoriesMetadata.repositories()) { |
| 371 | + if (repo.name().equals(remoteStateRepoName)) { |
| 372 | + isRemoteStateRepoConfigured = Boolean.TRUE; |
| 373 | + } else if (repo.name().equals(remoteRoutingTableRepoName)) { |
| 374 | + isRemoteRoutingTableRepoConfigured = Boolean.TRUE; |
| 375 | + } |
| 376 | + } |
| 377 | + |
| 378 | + Assert.assertTrue("RemoteState Repo is not set in RepositoriesMetadata", isRemoteStateRepoConfigured); |
| 379 | + Assert.assertTrue("RemoteRoutingTable Repo is not set in RepositoriesMetadata", isRemoteRoutingTableRepoConfigured); |
| 380 | + |
| 381 | + RepositoriesService repositoriesService = internalCluster().getInstance(RepositoriesService.class, randomNode); |
| 382 | + |
| 383 | + isRemoteStateRepoConfigured = isRepoPresentInRepositoryService(repositoriesService, remoteStateRepoName); |
| 384 | + isRemoteRoutingTableRepoConfigured = isRepoPresentInRepositoryService(repositoriesService, remoteRoutingTableRepoName); |
| 385 | + |
| 386 | + Assert.assertTrue("RemoteState Repo is not set in RepositoryService", isRemoteStateRepoConfigured); |
| 387 | + Assert.assertTrue("RemoteRoutingTable Repo is not set in RepositoryService", isRemoteRoutingTableRepoConfigured); |
| 388 | + |
| 389 | + logger.info("Stopping current Cluster Manager"); |
| 390 | + } |
| 391 | + |
| 392 | + private Boolean isRepoPresentInRepositoryService(RepositoriesService repositoriesService, String repoName) { |
| 393 | + try { |
| 394 | + Repository remoteStateRepo = repositoriesService.repository(repoName); |
| 395 | + if (Objects.nonNull(remoteStateRepo)) { |
| 396 | + return Boolean.TRUE; |
| 397 | + } |
| 398 | + } catch (RepositoryMissingException e) { |
| 399 | + return Boolean.FALSE; |
| 400 | + } |
| 401 | + |
| 402 | + return Boolean.FALSE; |
| 403 | + } |
| 404 | + |
253 | 405 | }
|
0 commit comments