 
 package org.opensearch.knn.index.codec.nativeindex.remote;
 
+import com.google.common.annotations.VisibleForTesting;
 import lombok.AllArgsConstructor;
 import lombok.extern.log4j.Log4j2;
 import org.opensearch.action.LatchedActionListener;
 ...
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Supplier;
+import java.util.zip.CRC32;
+import java.util.zip.CheckedInputStream;
 
 import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
 import static org.opensearch.knn.index.remote.KNNRemoteConstants.DOC_ID_FILE_EXTENSION;
@@ -74,50 +77,42 @@ public void writeToRepository(
         initializeVectorValues(knnVectorValues);
         long vectorBlobLength = (long) knnVectorValues.bytesPerVector() * totalLiveDocs;
 
-        if (blobContainer instanceof AsyncMultiStreamBlobContainer) {
+        if (blobContainer instanceof AsyncMultiStreamBlobContainer asyncBlobContainer) {
             // First initiate vectors upload
             log.debug("Repository {} Supports Parallel Blob Upload", repository);
             // WriteContext is the main entry point into asyncBlobUpload. It stores all of our upload configurations, analogous to
             // BuildIndexParams
-            WriteContext writeContext = new WriteContext.Builder().fileName(blobName + VECTOR_BLOB_FILE_EXTENSION)
-                .streamContextSupplier((partSize) -> getStreamContext(partSize, vectorBlobLength, knnVectorValuesSupplier, vectorDataType))
-                .fileSize(vectorBlobLength)
-                .failIfAlreadyExists(true)
-                .writePriority(WritePriority.NORMAL)
-                // TODO: Checksum implementations -- It is difficult to calculate a checksum on the knnVectorValues as
-                // there is no underlying file upon which we can create the checksum. We should be able to create a
-                // checksum still by iterating through once, however this will be an expensive operation.
-                .uploadFinalizer((bool) -> {})
-                .doRemoteDataIntegrityCheck(false)
-                .expectedChecksum(null)
-                .build();
+            WriteContext writeContext = createWriteContext(
+                blobName,
+                vectorBlobLength,
+                knnVectorValuesSupplier,
+                vectorDataType,
+                asyncBlobContainer.remoteIntegrityCheckSupported()
+            );
 
             AtomicReference<Exception> exception = new AtomicReference<>();
             final CountDownLatch latch = new CountDownLatch(1);
-            ((AsyncMultiStreamBlobContainer) blobContainer).asyncBlobUpload(
-                writeContext,
-                new LatchedActionListener<>(new ActionListener<>() {
-                    @Override
-                    public void onResponse(Void unused) {
-                        log.debug(
-                            "Parallel vector upload succeeded for blob {} with size {}",
-                            blobName + VECTOR_BLOB_FILE_EXTENSION,
-                            vectorBlobLength
-                        );
-                    }
-
-                    @Override
-                    public void onFailure(Exception e) {
-                        log.error(
-                            "Parallel vector upload failed for blob {} with size {}",
-                            blobName + VECTOR_BLOB_FILE_EXTENSION,
-                            vectorBlobLength,
-                            e
-                        );
-                        exception.set(e);
-                    }
-                }, latch)
-            );
+            asyncBlobContainer.asyncBlobUpload(writeContext, new LatchedActionListener<>(new ActionListener<>() {
+                @Override
+                public void onResponse(Void unused) {
+                    log.debug(
+                        "Parallel vector upload succeeded for blob {} with size {}",
+                        blobName + VECTOR_BLOB_FILE_EXTENSION,
+                        vectorBlobLength
+                    );
+                }
+
+                @Override
+                public void onFailure(Exception e) {
+                    log.error(
+                        "Parallel vector upload failed for blob {} with size {}",
+                        blobName + VECTOR_BLOB_FILE_EXTENSION,
+                        vectorBlobLength,
+                        e
+                    );
+                    exception.set(e);
+                }
+            }, latch));
 
             // Then upload doc id blob before waiting on vector uploads
             // TODO: We wrap with a BufferedInputStream to support retries. We can tune this buffer size to optimize performance.
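
For readers skimming the diff, the control flow above is a fork/join: start the asynchronous vector upload, record any failure in the AtomicReference, upload the doc id blob on the calling thread in the meantime, and only then block on the latch. A minimal, self-contained sketch of that pattern using only the JDK; the upload helper below is a hypothetical stand-in, not an OpenSearch API:

    import java.util.concurrent.CompletableFuture;
    import java.util.concurrent.CountDownLatch;
    import java.util.concurrent.atomic.AtomicReference;

    public class ParallelUploadSketch {
        public static void main(String[] args) throws Exception {
            AtomicReference<Exception> exception = new AtomicReference<>();
            CountDownLatch latch = new CountDownLatch(1);

            // Stand-in for asyncBlobUpload: run the vector upload on another thread,
            // record any failure, and count the latch down when done, which is the
            // role LatchedActionListener plays in the code above
            CompletableFuture.runAsync(() -> upload("vector blob")).whenComplete((unused, throwable) -> {
                if (throwable != null) {
                    exception.set(new Exception(throwable));
                }
                latch.countDown();
            });

            // The doc id upload proceeds without waiting on the vector upload
            upload("doc id blob");

            // Block until the async upload finishes, then surface any captured failure
            latch.await();
            if (exception.get() != null) {
                throw exception.get();
            }
        }

        private static void upload(String blobName) {
            System.out.println("uploading " + blobName);
        }
    }
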
@@ -215,6 +210,61 @@ private CheckedTriFunction<Integer, Long, Long, InputStreamContainer, IOExceptio
         });
     }
 
+    /**
+     * Creates a {@link WriteContext} meant to be used by {@link AsyncMultiStreamBlobContainer#asyncBlobUpload}. If integrity
+     * checking is supported, calculates an expected checksum as well.
+     *
+     * @param blobName base name of the blob; {@code VECTOR_BLOB_FILE_EXTENSION} is appended to form the file name
+     * @param vectorBlobLength size in bytes of the vector blob to upload
+     * @param knnVectorValuesSupplier supplies a fresh {@link KNNVectorValues} iterator over the vector data
+     * @param vectorDataType data type of the vectors being uploaded
+     * @param supportsIntegrityCheck whether the blob container supports a remote data integrity check
+     * @return the configured {@link WriteContext}
+     * @throws IOException if calculating the expected checksum fails
+     */
+    private WriteContext createWriteContext(
+        String blobName,
+        long vectorBlobLength,
+        Supplier<KNNVectorValues<?>> knnVectorValuesSupplier,
+        VectorDataType vectorDataType,
+        boolean supportsIntegrityCheck
+    ) throws IOException {
+        return new WriteContext.Builder().fileName(blobName + VECTOR_BLOB_FILE_EXTENSION)
+            .streamContextSupplier((partSize) -> getStreamContext(partSize, vectorBlobLength, knnVectorValuesSupplier, vectorDataType))
+            .fileSize(vectorBlobLength)
+            .failIfAlreadyExists(true)
+            .writePriority(WritePriority.NORMAL)
+            .doRemoteDataIntegrityCheck(supportsIntegrityCheck)
+            .uploadFinalizer((bool) -> {})
+            .expectedChecksum(supportsIntegrityCheck ? getExpectedChecksum(knnVectorValuesSupplier.get(), vectorDataType) : null)
+            .build();
+    }
+
+    /**
+     * Calculates a checksum over the given {@link KNNVectorValues}, representing all the vector data for the index build
+     * operation. This is done by wrapping a {@link VectorValuesInputStream} in a {@link CheckedInputStream} and reading all
+     * the data through the stream. Note: this adds some overhead to the vector blob upload, as we read through the
+     * KNNVectorValues an additional time. If the WriteContext accepted an expectedChecksumSupplier instead of an up-front
+     * expected checksum, we could calculate the checksum while the stream is being uploaded and compare against that same
+     * value; however, that is pending a change in OpenSearch core.
+     *
+     * @param knnVectorValues the vector values over which to calculate the checksum
+     * @param vectorDataType data type of the vectors being uploaded
+     * @return the CRC32 checksum of all the vector data
+     * @throws IOException if reading the vector data fails
+     */
+    @VisibleForTesting
+    long getExpectedChecksum(KNNVectorValues<?> knnVectorValues, VectorDataType vectorDataType) throws IOException {
+        initializeVectorValues(knnVectorValues);
+        // VectorValuesInputStream#read reads at most 1 vector at a time, so there is no point making this buffer any larger
+        int bufferSize = knnVectorValues.bytesPerVector();
+        final byte[] buffer = new byte[bufferSize];
+        try (
+            CheckedInputStream checkedStream = new CheckedInputStream(
+                new VectorValuesInputStream(knnVectorValues, vectorDataType),
+                new CRC32()
+            )
+        ) {
+            // Drain the stream so every byte of vector data passes through the CRC32
+            while (checkedStream.read(buffer, 0, bufferSize) != -1) {
+            }
+            return checkedStream.getChecksum().getValue();
+        }
+    }
+
     @Override
     public void readFromRepository(String path, IndexOutputWithBuffer indexOutputWithBuffer) throws IOException {
         if (path == null || path.isEmpty()) {
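
The checksum mechanics in getExpectedChecksum are standard JDK plumbing: wrapping any InputStream in a java.util.zip.CheckedInputStream feeds every byte read through the supplied Checksum implementation. A standalone sketch of the same technique, with a plain byte array standing in for the data streamed by VectorValuesInputStream:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.util.zip.CRC32;
    import java.util.zip.CheckedInputStream;

    public class ChecksumSketch {
        public static void main(String[] args) throws IOException {
            byte[] vectorData = new byte[1024]; // stand-in for the vector bytes
            try (CheckedInputStream in = new CheckedInputStream(new ByteArrayInputStream(vectorData), new CRC32())) {
                byte[] buffer = new byte[128];
                // Draining the stream pushes every byte through the CRC32
                while (in.read(buffer, 0, buffer.length) != -1) {
                }
                System.out.println("CRC32: " + in.getChecksum().getValue());
            }
        }
    }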