Skip to content

Commit 12dadcf

Browse files
authored
Retry remote state download while bootstrap (opensearch-project#15950)
* Retry remote state download while bootstrap Signed-off-by: Sooraj Sinha <soosinha@amazon.com>
1 parent b3cc802 commit 12dadcf

File tree

3 files changed

+134
-14
lines changed

3 files changed

+134
-14
lines changed

server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import org.opensearch.test.OpenSearchIntegTestCase;
4343
import org.junit.Before;
4444

45+
import java.io.IOError;
4546
import java.io.IOException;
4647
import java.nio.file.Files;
4748
import java.nio.file.Path;
@@ -339,10 +340,11 @@ public void testFullClusterRestoreManifestFilePointsToInvalidIndexMetadataPathTh
339340
for (UploadedIndexMetadata md : manifest.getIndices()) {
340341
Files.move(segmentRepoPath.resolve(md.getUploadedFilename()), segmentRepoPath.resolve("cluster-state/"));
341342
}
343+
internalCluster().stopAllNodes();
342344
} catch (IOException e) {
343345
throw new RuntimeException(e);
344346
}
345-
assertThrows(IllegalStateException.class, () -> addNewNodes(dataNodeCount, clusterManagerNodeCount));
347+
assertThrows(IOError.class, () -> internalCluster().client());
346348
// Test is complete
347349

348350
// Starting a node without remote state to ensure test cleanup

server/src/main/java/org/opensearch/gateway/GatewayMetaState.java

+51-9
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,13 @@
6666
import org.opensearch.gateway.remote.RemoteClusterStateService;
6767
import org.opensearch.gateway.remote.model.RemoteClusterStateManifestInfo;
6868
import org.opensearch.index.recovery.RemoteStoreRestoreService;
69-
import org.opensearch.index.recovery.RemoteStoreRestoreService.RemoteRestoreResult;
7069
import org.opensearch.node.Node;
7170
import org.opensearch.plugins.MetadataUpgrader;
7271
import org.opensearch.threadpool.ThreadPool;
7372
import org.opensearch.transport.TransportService;
7473

7574
import java.io.Closeable;
75+
import java.io.IOError;
7676
import java.io.IOException;
7777
import java.io.UncheckedIOException;
7878
import java.util.Collections;
@@ -109,6 +109,8 @@ public class GatewayMetaState implements Closeable {
109109
*/
110110
public static final String STALE_STATE_CONFIG_NODE_ID = "STALE_STATE_CONFIG";
111111

112+
// Static: one logger shared by every GatewayMetaState instance instead of a per-instance field.
private static final Logger logger = LogManager.getLogger(GatewayMetaState.class);
113+
112114
private PersistedStateRegistry persistedStateRegistry;
113115

114116
public PersistedState getPersistedState() {
@@ -175,15 +177,11 @@ public void start(
175177
);
176178
if (ClusterState.UNKNOWN_UUID.equals(lastKnownClusterUUID) == false) {
177179
// Load state from remote
178-
final RemoteRestoreResult remoteRestoreResult = remoteStoreRestoreService.restore(
179-
// Remote Metadata should always override local disk Metadata
180-
// if local disk Metadata's cluster uuid is UNKNOWN_UUID
181-
ClusterState.builder(clusterState).metadata(Metadata.EMPTY_METADATA).build(),
182-
lastKnownClusterUUID,
183-
false,
184-
new String[] {}
180+
clusterState = restoreClusterStateWithRetries(
181+
remoteStoreRestoreService,
182+
clusterState,
183+
lastKnownClusterUUID
185184
);
186-
clusterState = remoteRestoreResult.getClusterState();
187185
}
188186
}
189187
remotePersistedState = new RemotePersistedState(remoteClusterStateService, lastKnownClusterUUID);
@@ -258,6 +256,50 @@ public void start(
258256
}
259257
}
260258

259+
private ClusterState restoreClusterStateWithRetries(
260+
RemoteStoreRestoreService remoteStoreRestoreService,
261+
ClusterState clusterState,
262+
String lastKnownClusterUUID
263+
) {
264+
int maxAttempts = 5;
265+
int delayInMills = 200;
266+
for (int attempt = 1; attempt <= maxAttempts; attempt++) {
267+
try {
268+
logger.info("Attempt {} to restore cluster state", attempt);
269+
return restoreClusterState(remoteStoreRestoreService, clusterState, lastKnownClusterUUID);
270+
} catch (Exception e) {
271+
if (attempt == maxAttempts) {
272+
// Throw an Error so that the process is halted.
273+
throw new IOError(e);
274+
}
275+
try {
276+
TimeUnit.MILLISECONDS.sleep(delayInMills);
277+
} catch (InterruptedException ie) {
278+
Thread.currentThread().interrupt(); // Restore interrupted status
279+
throw new RuntimeException(ie);
280+
}
281+
delayInMills = delayInMills * 2;
282+
}
283+
}
284+
// This statement will never be reached.
285+
return null;
286+
}
287+
288+
ClusterState restoreClusterState(
289+
RemoteStoreRestoreService remoteStoreRestoreService,
290+
ClusterState clusterState,
291+
String lastKnownClusterUUID
292+
) {
293+
return remoteStoreRestoreService.restore(
294+
// Remote Metadata should always override local disk Metadata
295+
// if local disk Metadata's cluster uuid is UNKNOWN_UUID
296+
ClusterState.builder(clusterState).metadata(Metadata.EMPTY_METADATA).build(),
297+
lastKnownClusterUUID,
298+
false,
299+
new String[] {}
300+
).getClusterState();
301+
}
302+
261303
// exposed so it can be overridden by tests
262304
ClusterState prepareInitialClusterState(TransportService transportService, ClusterService clusterService, ClusterState clusterState) {
263305
assert clusterState.nodes().getLocalNode() == null : "prepareInitialClusterState must only be called once";

server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java

+80-4
Original file line numberDiff line numberDiff line change
@@ -1244,14 +1244,72 @@ public void testGatewayForRemoteStateForInitialBootstrapBlocksApplied() throws I
12441244
}
12451245
}
12461246

1247-
private MockGatewayMetaState newGatewayForRemoteState(
1247+
public void testGatewayMetaStateRemoteStateDownloadRetries() throws IOException {
1248+
MockGatewayMetaState gateway = null;
1249+
MockGatewayMetaState gatewayMetaStateSpy = null;
1250+
try {
1251+
RemoteClusterStateService remoteClusterStateService = mock(RemoteClusterStateService.class);
1252+
when(remoteClusterStateService.getLastKnownUUIDFromRemote("test-cluster")).thenReturn("test-cluster-uuid");
1253+
RemoteStoreRestoreService remoteStoreRestoreService = mock(RemoteStoreRestoreService.class);
1254+
when(remoteStoreRestoreService.restore(any(), any(), anyBoolean(), any())).thenThrow(
1255+
new IllegalStateException("unable to download cluster state")
1256+
).thenReturn(RemoteRestoreResult.build("test-cluster-uuid", null, ClusterState.EMPTY_STATE));
1257+
final PersistedStateRegistry persistedStateRegistry = persistedStateRegistry();
1258+
gateway = initializeGatewayForRemoteState(true);
1259+
gatewayMetaStateSpy = Mockito.spy(gateway);
1260+
startGatewayForRemoteState(
1261+
gatewayMetaStateSpy,
1262+
remoteClusterStateService,
1263+
remoteStoreRestoreService,
1264+
persistedStateRegistry,
1265+
ClusterState.EMPTY_STATE
1266+
);
1267+
verify(gatewayMetaStateSpy, times(2)).restoreClusterState(Mockito.any(), Mockito.any(), Mockito.any());
1268+
} finally {
1269+
IOUtils.close(gatewayMetaStateSpy);
1270+
}
1271+
}
1272+
1273+
public void testGatewayMetaStateRemoteStateDownloadFailure() throws IOException {
1274+
MockGatewayMetaState gateway = null;
1275+
final MockGatewayMetaState gatewayMetaStateSpy;
1276+
try {
1277+
RemoteClusterStateService remoteClusterStateService = mock(RemoteClusterStateService.class);
1278+
when(remoteClusterStateService.getLastKnownUUIDFromRemote("test-cluster")).thenReturn("test-cluster-uuid");
1279+
RemoteStoreRestoreService remoteStoreRestoreService = mock(RemoteStoreRestoreService.class);
1280+
when(remoteStoreRestoreService.restore(any(), any(), anyBoolean(), any())).thenThrow(
1281+
new IllegalStateException("unable to download cluster state")
1282+
);
1283+
final PersistedStateRegistry persistedStateRegistry = persistedStateRegistry();
1284+
gateway = initializeGatewayForRemoteState(true);
1285+
gatewayMetaStateSpy = Mockito.spy(gateway);
1286+
assertThrows(
1287+
Error.class,
1288+
() -> startGatewayForRemoteState(
1289+
gatewayMetaStateSpy,
1290+
remoteClusterStateService,
1291+
remoteStoreRestoreService,
1292+
persistedStateRegistry,
1293+
ClusterState.EMPTY_STATE
1294+
)
1295+
);
1296+
verify(gatewayMetaStateSpy, times(5)).restoreClusterState(Mockito.any(), Mockito.any(), Mockito.any());
1297+
} finally {
1298+
IOUtils.close(gateway);
1299+
}
1300+
}
1301+
1302+
private MockGatewayMetaState initializeGatewayForRemoteState(boolean prepareFullState) {
1303+
return new MockGatewayMetaState(localNode, bigArrays, prepareFullState);
1304+
}
1305+
1306+
private MockGatewayMetaState startGatewayForRemoteState(
1307+
MockGatewayMetaState gateway,
12481308
RemoteClusterStateService remoteClusterStateService,
12491309
RemoteStoreRestoreService remoteStoreRestoreService,
12501310
PersistedStateRegistry persistedStateRegistry,
1251-
ClusterState currentState,
1252-
boolean prepareFullState
1311+
ClusterState currentState
12531312
) throws IOException {
1254-
MockGatewayMetaState gateway = new MockGatewayMetaState(localNode, bigArrays, prepareFullState);
12551313
String randomRepoName = "randomRepoName";
12561314
String stateRepoTypeAttributeKey = String.format(
12571315
Locale.getDefault(),
@@ -1305,6 +1363,24 @@ private MockGatewayMetaState newGatewayForRemoteState(
13051363
return gateway;
13061364
}
13071365

1366+
private MockGatewayMetaState newGatewayForRemoteState(
1367+
RemoteClusterStateService remoteClusterStateService,
1368+
RemoteStoreRestoreService remoteStoreRestoreService,
1369+
PersistedStateRegistry persistedStateRegistry,
1370+
ClusterState currentState,
1371+
boolean prepareFullState
1372+
) throws IOException {
1373+
MockGatewayMetaState gatewayMetaState = initializeGatewayForRemoteState(prepareFullState);
1374+
startGatewayForRemoteState(
1375+
gatewayMetaState,
1376+
remoteClusterStateService,
1377+
remoteStoreRestoreService,
1378+
persistedStateRegistry,
1379+
currentState
1380+
);
1381+
return gatewayMetaState;
1382+
}
1383+
13081384
private static BigArrays getBigArrays() {
13091385
return usually()
13101386
? BigArrays.NON_RECYCLING_INSTANCE

0 commit comments

Comments
 (0)