|
| 1 | +/* |
| 2 | + * Copyright OpenSearch Contributors |
| 3 | + * SPDX-License-Identifier: Apache-2.0 |
| 4 | + */ |
| 5 | + |
| 6 | +package org.opensearch.knn.memoryoptsearch.faiss; |
| 7 | + |
| 8 | +import lombok.Getter; |
| 9 | +import org.apache.lucene.store.IndexInput; |
| 10 | + |
| 11 | +import java.io.IOException; |
| 12 | + |
| 13 | +/** |
| 14 | + * Ported implementation of the FAISS HNSW graph search algorithm. |
| 15 | + * While it follows the same steps as the original FAISS implementation, differences in how the JVM and C++ handle floating-point |
| 16 | + * calculations can lead to slight variations in results. However, such cases are very rare, and in most instances, the results are |
| 17 | + * identical to FAISS. Even when there are ranking differences, they do not impact the precision or recall of the search. |
| 18 | + * For more details, refer to the [FAISS HNSW implementation]( |
| 19 | + * <a href="https://github.com/facebookresearch/faiss/blob/main/faiss/impl/HNSW.h">...</a>). |
| 20 | + */ |
| 21 | +@Getter |
| 22 | +public class FaissHNSW { |
| 23 | + // Cumulative number of neighbors per each level. |
| 24 | + private int[] cumNumberNeighborPerLevel; |
| 25 | + // Offset to be added to cumNumberNeighborPerLevel[level] to get the actual start offset of neighbor list. |
| 26 | + private long[] offsets = null; |
| 27 | + // Neighbor list storage. |
| 28 | + private final FaissSection neighbors = new FaissSection(); |
| 29 | + // Entry point in HNSW graph |
| 30 | + private int entryPoint; |
| 31 | + // Maximum level of HNSW graph |
| 32 | + private int maxLevel = -1; |
| 33 | + // Default efSearch parameter. This determines the navigation queue size. |
| 34 | + // More value, algorithm will more navigate candidates. |
| 35 | + private int efSearch = 16; |
| 36 | + // Total number of vectors stored in graph. |
| 37 | + private long totalNumberOfVectors; |
| 38 | + |
| 39 | + /** |
| 40 | + * Partially loads the FAISS HNSW graph from the provided index input stream. |
| 41 | + * The graph is divided into multiple sections, and this method marks the starting offset of each section then skip to the next |
| 42 | + * section instead of loading the entire graph into memory. During the search, bytes will be accessed via {@link IndexInput}. |
| 43 | + * |
| 44 | + * @param input An input stream for a FAISS HNSW graph file, allowing access to the neighbor list and vector locations. |
| 45 | + * @param totalNumberOfVectors The total number of vectors stored in the graph. |
| 46 | + * @return {@link FaissHNSW}, a graph search structure that represents the FAISS HNSW graph |
| 47 | + * |
| 48 | + * FYI <a href="https://github.com/facebookresearch/faiss/blob/main/faiss/impl/index_read.cpp#L363">FAISS Deserialization</a> |
| 49 | + * |
| 50 | + * @throws IOException |
| 51 | + */ |
| 52 | + public static FaissHNSW load(IndexInput input, long totalNumberOfVectors) throws IOException { |
| 53 | + // Total number of vectors |
| 54 | + FaissHNSW faissHNSW = new FaissHNSW(); |
| 55 | + faissHNSW.totalNumberOfVectors = totalNumberOfVectors; |
| 56 | + |
| 57 | + // We don't use `double[] assignProbas` for search. It is for index construction. |
| 58 | + long size = input.readLong(); |
| 59 | + input.skipBytes(Double.BYTES * size); |
| 60 | + |
| 61 | + // Accumulate number of neighbor per each level. |
| 62 | + size = input.readLong(); |
| 63 | + faissHNSW.cumNumberNeighborPerLevel = new int[(int) size]; |
| 64 | + if (size > 0) { |
| 65 | + input.readInts(faissHNSW.cumNumberNeighborPerLevel, 0, (int) size); |
| 66 | + } |
| 67 | + |
| 68 | + // We don't use `level`. |
| 69 | + final FaissSection levels = new FaissSection(); |
| 70 | + levels.markSection(input, Integer.BYTES); |
| 71 | + |
| 72 | + // Load `offsets` into memory. |
| 73 | + size = input.readLong(); |
| 74 | + faissHNSW.offsets = new long[(int) size]; |
| 75 | + input.readLongs(faissHNSW.offsets, 0, faissHNSW.offsets.length); |
| 76 | + |
| 77 | + // Mark neighbor list section. |
| 78 | + faissHNSW.neighbors.markSection(input, Integer.BYTES); |
| 79 | + |
| 80 | + // HNSW graph parameters |
| 81 | + faissHNSW.entryPoint = input.readInt(); |
| 82 | + |
| 83 | + faissHNSW.maxLevel = input.readInt(); |
| 84 | + |
| 85 | + // We don't use this field. It's for index building. |
| 86 | + final int efConstruction = input.readInt(); |
| 87 | + |
| 88 | + faissHNSW.efSearch = input.readInt(); |
| 89 | + |
| 90 | + // dummy read a deprecated field. |
| 91 | + input.readInt(); |
| 92 | + |
| 93 | + return faissHNSW; |
| 94 | + } |
| 95 | +} |
0 commit comments