Skip to content

Commit 0a88963

Browse files
authored
Enable Fuzzy codec for doc id fields using a bloom filter (opensearch-project#11027)
* Enable Fuzzy codec for doc id fields using a bloom filter Signed-off-by: mgodwan <mgodwan@amazon.com>
1 parent aad2630 commit 0a88963

24 files changed

+1538
-3
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
137137
- New DateTime format for RFC3339 compatible date fields ([#11465](https://github.com/opensearch-project/OpenSearch/pull/11465))
138138
- Add support for Google Application Default Credentials in repository-gcs ([#8394](https://github.com/opensearch-project/OpenSearch/pull/8394))
139139
- Remove concurrent segment search feature flag for GA launch ([#12074](https://github.com/opensearch-project/OpenSearch/pull/12074))
140+
- Enable Fuzzy codec for doc id fields using a bloom filter ([#11027](https://github.com/opensearch-project/OpenSearch/pull/11027))
140141

141142
### Dependencies
142143
- Bumps jetty version to 9.4.52.v20230823 to fix GMS-2023-1857 ([#9822](https://github.com/opensearch-project/OpenSearch/pull/9822))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.benchmark.index.codec.fuzzy;
10+
11+
import org.apache.lucene.util.BytesRef;
12+
import org.opensearch.common.UUIDs;
13+
import org.opensearch.index.codec.fuzzy.FuzzySet;
14+
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
15+
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
16+
import org.opensearch.index.mapper.IdFieldMapper;
17+
import org.openjdk.jmh.annotations.Benchmark;
18+
import org.openjdk.jmh.annotations.BenchmarkMode;
19+
import org.openjdk.jmh.annotations.Fork;
20+
import org.openjdk.jmh.annotations.Measurement;
21+
import org.openjdk.jmh.annotations.Mode;
22+
import org.openjdk.jmh.annotations.OutputTimeUnit;
23+
import org.openjdk.jmh.annotations.Param;
24+
import org.openjdk.jmh.annotations.Scope;
25+
import org.openjdk.jmh.annotations.Setup;
26+
import org.openjdk.jmh.annotations.State;
27+
import org.openjdk.jmh.annotations.Warmup;
28+
29+
import java.io.IOException;
30+
import java.util.List;
31+
import java.util.Map;
32+
import java.util.concurrent.TimeUnit;
33+
import java.util.stream.Collectors;
34+
import java.util.stream.IntStream;
35+
36+
@Fork(3)
37+
@Warmup(iterations = 2)
38+
@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
39+
@BenchmarkMode(Mode.AverageTime)
40+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
41+
@State(Scope.Benchmark)
42+
public class FilterConstructionBenchmark {
43+
44+
private List<BytesRef> items;
45+
46+
@Param({ "1000000", "10000000", "50000000" })
47+
private int numIds;
48+
49+
@Param({ "0.0511", "0.1023", "0.2047" })
50+
private double fpp;
51+
52+
private FuzzySetFactory fuzzySetFactory;
53+
private String fieldName;
54+
55+
@Setup
56+
public void setupIds() {
57+
this.fieldName = IdFieldMapper.NAME;
58+
this.items = IntStream.range(0, numIds).mapToObj(i -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
59+
FuzzySetParameters parameters = new FuzzySetParameters(() -> fpp);
60+
this.fuzzySetFactory = new FuzzySetFactory(Map.of(fieldName, parameters));
61+
}
62+
63+
@Benchmark
64+
public FuzzySet buildFilter() throws IOException {
65+
return fuzzySetFactory.createFuzzySet(items.size(), fieldName, () -> items.iterator());
66+
}
67+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.benchmark.index.codec.fuzzy;
10+
11+
import org.apache.lucene.util.BytesRef;
12+
import org.opensearch.common.UUIDs;
13+
import org.opensearch.index.codec.fuzzy.FuzzySet;
14+
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
15+
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
16+
import org.opensearch.index.mapper.IdFieldMapper;
17+
import org.openjdk.jmh.annotations.Benchmark;
18+
import org.openjdk.jmh.annotations.BenchmarkMode;
19+
import org.openjdk.jmh.annotations.Fork;
20+
import org.openjdk.jmh.annotations.Measurement;
21+
import org.openjdk.jmh.annotations.Mode;
22+
import org.openjdk.jmh.annotations.OutputTimeUnit;
23+
import org.openjdk.jmh.annotations.Param;
24+
import org.openjdk.jmh.annotations.Scope;
25+
import org.openjdk.jmh.annotations.Setup;
26+
import org.openjdk.jmh.annotations.State;
27+
import org.openjdk.jmh.annotations.Warmup;
28+
import org.openjdk.jmh.infra.Blackhole;
29+
30+
import java.io.IOException;
31+
import java.util.List;
32+
import java.util.Map;
33+
import java.util.Random;
34+
import java.util.concurrent.TimeUnit;
35+
import java.util.stream.Collectors;
36+
import java.util.stream.IntStream;
37+
38+
@Fork(3)
39+
@Warmup(iterations = 2)
40+
@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
41+
@BenchmarkMode(Mode.AverageTime)
42+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
43+
@State(Scope.Benchmark)
44+
public class FilterLookupBenchmark {
45+
46+
@Param({ "50000000", "1000000" })
47+
private int numItems;
48+
49+
@Param({ "1000000" })
50+
private int searchKeyCount;
51+
52+
@Param({ "0.0511", "0.1023", "0.2047" })
53+
private double fpp;
54+
55+
private FuzzySet fuzzySet;
56+
private List<BytesRef> items;
57+
private Random random = new Random();
58+
59+
@Setup
60+
public void setupFilter() throws IOException {
61+
String fieldName = IdFieldMapper.NAME;
62+
items = IntStream.range(0, numItems).mapToObj(i -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
63+
FuzzySetParameters parameters = new FuzzySetParameters(() -> fpp);
64+
fuzzySet = new FuzzySetFactory(Map.of(fieldName, parameters)).createFuzzySet(numItems, fieldName, () -> items.iterator());
65+
}
66+
67+
@Benchmark
68+
public void contains_withExistingKeys(Blackhole blackhole) throws IOException {
69+
for (int i = 0; i < searchKeyCount; i++) {
70+
blackhole.consume(fuzzySet.contains(items.get(random.nextInt(items.size()))) == FuzzySet.Result.MAYBE);
71+
}
72+
}
73+
74+
@Benchmark
75+
public void contains_withRandomKeys(Blackhole blackhole) throws IOException {
76+
for (int i = 0; i < searchKeyCount; i++) {
77+
blackhole.consume(fuzzySet.contains(new BytesRef(UUIDs.base64UUID())));
78+
}
79+
}
80+
}

qa/rolling-upgrade/build.gradle

+1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ for (Version bwcVersion : BuildParams.bwcVersions.wireCompatible) {
6262
setting 'repositories.url.allowed_urls', 'http://snapshot.test*'
6363
setting 'path.repo', "${buildDir}/cluster/shared/repo/${baseName}"
6464
setting 'http.content_type.required', 'true'
65+
systemProperty 'opensearch.experimental.optimize_doc_id_lookup.fuzzy_set.enabled', 'true'
6566
}
6667
}
6768

qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java

+87-1
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@
4040
import org.opensearch.common.Booleans;
4141
import org.opensearch.common.io.Streams;
4242
import org.opensearch.common.settings.Settings;
43+
import org.opensearch.index.IndexSettings;
4344
import org.opensearch.index.codec.CodecService;
4445
import org.opensearch.index.engine.EngineConfig;
4546
import org.opensearch.indices.replication.common.ReplicationType;
46-
import org.opensearch.test.OpenSearchIntegTestCase;
4747
import org.opensearch.test.rest.yaml.ObjectPath;
4848

4949
import java.io.IOException;
@@ -344,6 +344,92 @@ public void testIndexingWithSegRep() throws Exception {
344344
}
345345
}
346346

347+
public void testIndexingWithFuzzyFilterPostings() throws Exception {
348+
if (UPGRADE_FROM_VERSION.onOrBefore(Version.V_2_11_1)) {
349+
logger.info("--> Skip test for version {} where fuzzy filter postings format feature is not available", UPGRADE_FROM_VERSION);
350+
return;
351+
}
352+
final String indexName = "test-index-fuzzy-set";
353+
final int shardCount = 3;
354+
final int replicaCount = 1;
355+
logger.info("--> Case {}", CLUSTER_TYPE);
356+
printClusterNodes();
357+
logger.info("--> _cat/shards before test execution \n{}", EntityUtils.toString(client().performRequest(new Request("GET", "/_cat/shards?v")).getEntity()));
358+
switch (CLUSTER_TYPE) {
359+
case OLD:
360+
Settings.Builder settings = Settings.builder()
361+
.put(IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), shardCount)
362+
.put(IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), replicaCount)
363+
.put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT)
364+
.put(
365+
EngineConfig.INDEX_CODEC_SETTING.getKey(),
366+
randomFrom(new ArrayList<>(CODECS) {
367+
{
368+
add(CodecService.LUCENE_DEFAULT_CODEC);
369+
}
370+
})
371+
)
372+
.put(INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "100ms");
373+
createIndex(indexName, settings.build());
374+
waitForClusterHealthWithNoShardMigration(indexName, "green");
375+
bulk(indexName, "_OLD", 5);
376+
break;
377+
case MIXED:
378+
waitForClusterHealthWithNoShardMigration(indexName, "yellow");
379+
break;
380+
case UPGRADED:
381+
Settings.Builder settingsBuilder = Settings.builder()
382+
.put(IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING.getKey(), true);
383+
updateIndexSettings(indexName, settingsBuilder);
384+
waitForClusterHealthWithNoShardMigration(indexName, "green");
385+
break;
386+
default:
387+
throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
388+
}
389+
390+
int expectedCount;
391+
switch (CLUSTER_TYPE) {
392+
case OLD:
393+
expectedCount = 5;
394+
break;
395+
case MIXED:
396+
if (Booleans.parseBoolean(System.getProperty("tests.first_round"))) {
397+
expectedCount = 5;
398+
} else {
399+
expectedCount = 10;
400+
}
401+
break;
402+
case UPGRADED:
403+
expectedCount = 15;
404+
break;
405+
default:
406+
throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
407+
}
408+
409+
waitForSearchableDocs(indexName, shardCount, replicaCount);
410+
assertCount(indexName, expectedCount);
411+
412+
if (CLUSTER_TYPE != ClusterType.OLD) {
413+
bulk(indexName, "_" + CLUSTER_TYPE, 5);
414+
logger.info("--> Index one doc (to be deleted next) and verify doc count");
415+
Request toBeDeleted = new Request("PUT", "/" + indexName + "/_doc/to_be_deleted");
416+
toBeDeleted.addParameter("refresh", "true");
417+
toBeDeleted.setJsonEntity("{\"f1\": \"delete-me\"}");
418+
client().performRequest(toBeDeleted);
419+
waitForSearchableDocs(indexName, shardCount, replicaCount);
420+
assertCount(indexName, expectedCount + 6);
421+
422+
logger.info("--> Delete previously added doc and verify doc count");
423+
Request delete = new Request("DELETE", "/" + indexName + "/_doc/to_be_deleted");
424+
delete.addParameter("refresh", "true");
425+
client().performRequest(delete);
426+
waitForSearchableDocs(indexName, shardCount, replicaCount);
427+
assertCount(indexName, expectedCount + 5);
428+
429+
//forceMergeAndVerify(indexName, shardCount * (1 + replicaCount));
430+
}
431+
}
432+
347433
public void testAutoIdWithOpTypeCreate() throws IOException {
348434
final String indexName = "auto_id_and_op_type_create_index";
349435
StringBuilder b = new StringBuilder();

server/src/main/java/org/opensearch/common/settings/FeatureFlagSettings.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ protected FeatureFlagSettings(
3434
FeatureFlags.IDENTITY_SETTING,
3535
FeatureFlags.TELEMETRY_SETTING,
3636
FeatureFlags.DATETIME_FORMATTER_CACHING_SETTING,
37-
FeatureFlags.WRITEABLE_REMOTE_INDEX_SETTING
37+
FeatureFlags.WRITEABLE_REMOTE_INDEX_SETTING,
38+
FeatureFlags.DOC_ID_FUZZY_SET_SETTING
3839
);
3940
}

server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java

+3
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,9 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
229229
IndexMetadata.INDEX_REMOTE_SEGMENT_STORE_REPOSITORY_SETTING,
230230
IndexMetadata.INDEX_REMOTE_TRANSLOG_REPOSITORY_SETTING,
231231

232+
IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING,
233+
IndexSettings.INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING,
234+
232235
// Settings for concurrent segment search
233236
IndexSettings.INDEX_CONCURRENT_SEGMENT_SEARCH_SETTING,
234237

server/src/main/java/org/opensearch/common/util/FeatureFlags.java

+7
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ public class FeatureFlags {
5454
*/
5555
public static final String WRITEABLE_REMOTE_INDEX = "opensearch.experimental.feature.writeable_remote_index.enabled";
5656

57+
/**
58+
* Gates the optimization to enable bloom filters for doc id lookup.
59+
*/
60+
public static final String DOC_ID_FUZZY_SET = "opensearch.experimental.optimize_doc_id_lookup.fuzzy_set.enabled";
61+
5762
/**
5863
* Should store the settings from opensearch.yml.
5964
*/
@@ -110,4 +115,6 @@ public static boolean isEnabled(Setting<Boolean> featureFlag) {
110115
false,
111116
Property.NodeScope
112117
);
118+
119+
/**
 * Node-scoped boolean setting registered under the {@link #DOC_ID_FUZZY_SET} key;
 * defaults to {@code false}.
 */
public static final Setting<Boolean> DOC_ID_FUZZY_SET_SETTING = Setting.boolSetting(DOC_ID_FUZZY_SET, false, Property.NodeScope);
113120
}

0 commit comments

Comments
 (0)