Commit 4a71a7a

Author: Jay Deng
Commit message: WIP
Parent: 7511f21

3 files changed (+94 -7 lines)

src/main/java/org/opensearch/knn/index/codec/nativeindex/remote/RemoteIndexBuildStrategy.java (+36 -6)

@@ -5,8 +5,10 @@
 
 package org.opensearch.knn.index.codec.nativeindex.remote;
 
+import com.google.common.annotations.VisibleForTesting;
 import lombok.extern.log4j.Log4j2;
 import org.apache.commons.lang.NotImplementedException;
+import org.apache.lucene.store.IndexOutput;
 import org.opensearch.action.LatchedActionListener;
 import org.opensearch.common.CheckedTriFunction;
 import org.opensearch.common.StopWatch;
@@ -25,6 +27,7 @@
 import org.opensearch.knn.index.VectorDataType;
 import org.opensearch.knn.index.codec.nativeindex.NativeIndexBuildStrategy;
 import org.opensearch.knn.index.codec.nativeindex.model.BuildIndexParams;
+import org.opensearch.knn.index.engine.KNNEngine;
 import org.opensearch.knn.index.vectorvalues.KNNVectorValues;
 import org.opensearch.repositories.RepositoriesService;
 import org.opensearch.repositories.Repository;
@@ -143,7 +146,8 @@ public void buildAndWriteIndex(BuildIndexParams indexInfo) throws IOException {
             log.debug("Await vector build took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
 
             stopWatch = new StopWatch().start();
-            readFromRepository();
+            // TODO: This blob will be retrieved from the remote vector build service status response
+            readFromRepository(blobName + KNNEngine.FAISS.getExtension(), indexInfo.getIndexOutputWithBuffer().getIndexOutput());
             time_in_millis = stopWatch.stop().totalTime().millis();
             log.debug("Repository read took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName());
         } catch (Exception e) {
@@ -171,6 +175,14 @@ private BlobStoreRepository getRepository() throws RepositoryMissingException {
         return (BlobStoreRepository) repository;
     }
 
+    /**
+     * @return The blob container to read from and write to, determined from the repository base path and the index settings. This container is where all blobs will be written to.
+     */
+    private BlobContainer getBlobContainer() {
+        BlobPath path = getRepository().basePath().add(indexSettings.getUUID() + VECTORS_PATH);
+        return getRepository().blobStore().blobContainer(path);
+    }
+
     /**
      * This method is responsible for writing both the vector blobs and doc ids provided by {@param knnVectorValuesSupplier} to the vector repository configured by {@link KNN_REMOTE_VECTOR_REPO_SETTING}.
      * If the repository implements {@link AsyncMultiStreamBlobContainer}, then parallel uploads will be used. Parallel uploads are backed by a {@link WriteContext}, for which we have a custom
@@ -192,9 +204,7 @@ private void writeToRepository(
         VectorDataType vectorDataType,
         Supplier<KNNVectorValues<?>> knnVectorValuesSupplier
     ) throws IOException, InterruptedException {
-        // Get the blob container based on blobName and the repo base path. This is where the blobs will be written to.
-        BlobPath path = getRepository().basePath().add(indexSettings.getUUID() + VECTORS_PATH);
-        BlobContainer blobContainer = getRepository().blobStore().blobContainer(path);
+        BlobContainer blobContainer = getBlobContainer();
 
         KNNVectorValues<?> knnVectorValues = knnVectorValuesSupplier.get();
         initializeVectorValues(knnVectorValues);
@@ -343,7 +353,27 @@ private void awaitVectorBuild() {
     /**
      * Read the constructed vector file from the remote repository and write it to the IndexOutput
      */
-    private void readFromRepository() {
-        throw new NotImplementedException();
+    @VisibleForTesting
+    void readFromRepository(String blobName, IndexOutput indexOutput) throws IOException {
+        BlobContainer blobContainer = getBlobContainer();
+        // TODO: We are using the sequential download API as multi-part parallel download is difficult for us to implement today and
+        // requires some changes in core. For more details, see: https://github.com/opensearch-project/k-NN/issues/2464
+        InputStream graphStream = blobContainer.readBlob(blobName);
+
+        // Allocate a 64KB buffer, the same size used for CPU builds; see IndexOutputWithBuffer
+        int CHUNK_SIZE = 64 * 1024;
+        byte[] buffer = new byte[CHUNK_SIZE];
+
+        int bytesRead = 0;
+        // InputStream#read returns -1 when there are no more bytes to be read
+        while (bytesRead != -1) {
+            // Try to read CHUNK_SIZE bytes into the buffer. The actual amount read may be less.
+            bytesRead = graphStream.read(buffer, 0, CHUNK_SIZE);
+            assert bytesRead <= CHUNK_SIZE;
+            // Write however many bytes were actually read to the IndexOutput, unless the stream is exhausted (-1)
+            if (bytesRead != -1) {
+                indexOutput.writeBytes(buffer, 0, bytesRead);
+            }
+        }
     }
 }
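
A note for readers skimming the diff: readFromRepository above is essentially a chunked stream copy. The sketch below (illustration only, not code from this commit) shows the same loop shape against plain JDK streams, with ByteArrayInputStream standing in for blobContainer.readBlob(blobName) and ByteArrayOutputStream standing in for the Lucene IndexOutput. The 64 KB chunk size mirrors the buffer above, and the payload length is deliberately not a multiple of the chunk size so the final read returns a partial chunk.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Random;

public class ChunkedCopyDemo {
    public static void main(String[] args) throws IOException {
        // Payload that is not a multiple of 64 KB, so the last read is a partial chunk.
        byte[] source = new byte[64 * 1024 * 3 + 123];
        new Random(42).nextBytes(source);

        InputStream in = new ByteArrayInputStream(source);       // stand-in for blobContainer.readBlob(blobName)
        ByteArrayOutputStream out = new ByteArrayOutputStream(); // stand-in for the Lucene IndexOutput

        final int CHUNK_SIZE = 64 * 1024; // same buffer size as IndexOutputWithBuffer
        byte[] buffer = new byte[CHUNK_SIZE];

        int bytesRead = 0;
        // InputStream#read returns -1 once the stream is exhausted.
        while (bytesRead != -1) {
            bytesRead = in.read(buffer, 0, CHUNK_SIZE); // may return fewer than CHUNK_SIZE bytes
            if (bytesRead != -1) {
                out.write(buffer, 0, bytesRead);        // write only the bytes actually read
            }
        }

        // The copy is byte-for-byte identical to the source.
        System.out.println(Arrays.equals(source, out.toByteArray())); // prints: true
    }
}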

src/main/java/org/opensearch/knn/index/store/IndexOutputWithBuffer.java (+4 -1)

@@ -5,13 +5,16 @@
 
 package org.opensearch.knn.index.store;
 
+import lombok.Getter;
 import org.apache.lucene.store.IndexOutput;
 
 import java.io.IOException;
 
 public class IndexOutputWithBuffer {
+    // The getter is exposed so that RemoteIndexBuildStrategy can write to the IndexOutput.
+    @Getter
     // Underlying `IndexOutput` obtained from Lucene's Directory.
-    private IndexOutput indexOutput;
+    private final IndexOutput indexOutput;
     // Write buffer. Native engine will copy bytes into this buffer.
     // Allocating 64KB here since it shows better performance in NMSLIB at this size. (We saw a slight improvement in FAISS compared to 4KB.)
    // NMSLIB writes an adjacent list size first, then followed by serializing the list. Since we usually have more adjacent lists, having
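
A note on the @Getter change above: Lombok generates the accessor at compile time, so the annotation is equivalent to hand-writing a getter on the existing field. Below is a minimal sketch of that expansion (the class name is changed to make clear this is an illustration, and the constructor is included only so the sketch compiles on its own); the generated getIndexOutput() is the call used by RemoteIndexBuildStrategy in the first file of this diff.

import org.apache.lucene.store.IndexOutput;

// Hand-written equivalent of what lombok.Getter generates for the annotated field (illustrative sketch).
public class IndexOutputWithBufferSketch {

    // Underlying IndexOutput, as in the real class.
    private final IndexOutput indexOutput;

    // Constructor added only to make the sketch self-contained.
    public IndexOutputWithBufferSketch(IndexOutput indexOutput) {
        this.indexOutput = indexOutput;
    }

    // Generated accessor; this is the getIndexOutput() used in buildAndWriteIndex() above.
    public IndexOutput getIndexOutput() {
        return indexOutput;
    }
}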

src/test/java/org/opensearch/knn/index/codec/nativeindex/remote/RemoteIndexBuildStrategyTests.java (+54)

@@ -12,6 +12,8 @@
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.InfoStream;
 import org.apache.lucene.util.Version;
 import org.junit.Before;
@@ -42,11 +44,13 @@
 import org.opensearch.repositories.RepositoryMissingException;
 import org.opensearch.repositories.blobstore.BlobStoreRepository;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Path;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.Mockito.mock;
@@ -208,4 +212,54 @@ public void testRepositoryInteraction() throws IOException {
         verify(mockBlobStore).blobContainer(any());
         verify(mockRepository).basePath();
     }
+
+    /**
+     * Verify the buffered read method in {@link RemoteIndexBuildStrategy#readFromRepository} produces the correct result
+     */
+    public void testRepositoryRead() throws IOException {
+        // Create an InputStream with random values
+        int TEST_ARRAY_SIZE = 64 * 1024 * 10;
+        byte[] byteArray = new byte[TEST_ARRAY_SIZE];
+        Random random = new Random();
+        random.nextBytes(byteArray);
+        InputStream randomStream = new ByteArrayInputStream(byteArray);
+
+        // Create a test segment that we will read/write from
+        Directory directory;
+        directory = newFSDirectory(createTempDir());
+        String TEST_SEGMENT_NAME = "test-segment-name";
+        IndexOutput testIndexOutput = directory.createOutput(TEST_SEGMENT_NAME, IOContext.DEFAULT);
+
+        // Set up RemoteIndexBuildStrategy and write to IndexOutput
+        RepositoriesService repositoriesService = mock(RepositoriesService.class);
+        BlobStoreRepository mockRepository = mock(BlobStoreRepository.class);
+        BlobPath testBasePath = new BlobPath().add("testBasePath");
+        BlobStore mockBlobStore = mock(BlobStore.class);
+        AsyncMultiStreamBlobContainer mockBlobContainer = mock(AsyncMultiStreamBlobContainer.class);
+
+        when(repositoriesService.repository(any())).thenReturn(mockRepository);
+        when(mockRepository.basePath()).thenReturn(testBasePath);
+        when(mockRepository.blobStore()).thenReturn(mockBlobStore);
+        when(mockBlobStore.blobContainer(any())).thenReturn(mockBlobContainer);
+        when(mockBlobContainer.readBlob("test-blob")).thenReturn(randomStream);
+
+        RemoteIndexBuildStrategy objectUnderTest = new RemoteIndexBuildStrategy(
+            () -> repositoriesService,
+            mock(NativeIndexBuildStrategy.class),
+            mock(IndexSettings.class)
+        );
+        // This should read from randomStream into testIndexOutput
+        objectUnderTest.readFromRepository("test-blob", testIndexOutput);
+        testIndexOutput.close();
+
+        // Now read back what was written to the IndexOutput
+        IndexInput testIndexInput = directory.openInput(TEST_SEGMENT_NAME, IOContext.DEFAULT);
+        byte[] resultByteArray = new byte[TEST_ARRAY_SIZE];
+        testIndexInput.readBytes(resultByteArray, 0, TEST_ARRAY_SIZE);
+        assertArrayEquals(byteArray, resultByteArray);
+
+        // Test cleanup
+        testIndexInput.close();
+        directory.close();
+    }
 }
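
A possible follow-up for testRepositoryRead() (not part of this commit): since mockBlobContainer is a Mockito mock, the test could additionally assert that readFromRepository requests exactly the blob name it was given, in the same style as the verify calls in testRepositoryInteraction above.

// Hypothetical extra assertion at the end of testRepositoryRead(), reusing mockBlobContainer from the test above
verify(mockBlobContainer).readBlob("test-blob");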
