Commit 9661e8d

Authored by Matt Ridehalgh
fix: MinHash token filter parameters not working (#15233)
* fix: minhash configuration
  Signed-off-by: Matt Ridehalgh <mrideh@amazon.co.uk>
* style: linting
  Signed-off-by: Matt Ridehalgh <mrideh@amazon.co.uk>
* chore: update CHANGELOG.md
  Signed-off-by: Matt Ridehalgh <mrideh@amazon.co.uk>

---------

Signed-off-by: Matt Ridehalgh <mrideh@amazon.co.uk>
1 parent a900a16 commit 9661e8d

File tree: 3 files changed, +58 -15 lines changed

- CHANGELOG.md (+2 -1)
- modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java (+2 -2)
- modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java (+54 -12)

CHANGELOG.md (+2 -1)

@@ -34,7 +34,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Bump `com.azure:azure-core` from 1.49.1 to 1.51.0 ([#15111](https://github.com/opensearch-project/OpenSearch/pull/15111))
 - Bump `org.xerial.snappy:snappy-java` from 1.1.10.5 to 1.1.10.6 ([#15207](https://github.com/opensearch-project/OpenSearch/pull/15207))
 - Bump `com.azure:azure-xml` from 1.0.0 to 1.1.0 ([#15206](https://github.com/opensearch-project/OpenSearch/pull/15206))
-- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
+- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
 - Bump `reactor-netty` from 1.1.21 to 1.1.22 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))

 ### Changed

@@ -53,6 +53,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126))
 - Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620))
 - Fix range aggregation optimization ignoring top level queries ([#15194](https://github.com/opensearch-project/OpenSearch/pull/15194))
+- Fix incorrect parameter names in MinHash token filter configuration handling ([#15233](https://github.com/opensearch-project/OpenSearch/pull/15233))

 ### Security


modules/analysis-common/src/main/java/org/opensearch/analysis/common/MinHashTokenFilterFactory.java (+2 -2)

@@ -65,10 +65,10 @@ private Map<String, String> convertSettings(Settings settings) {
         if (settings.hasValue("hash_count")) {
             settingMap.put("hashCount", settings.get("hash_count"));
         }
-        if (settings.hasValue("bucketCount")) {
+        if (settings.hasValue("bucket_count")) {
             settingMap.put("bucketCount", settings.get("bucket_count"));
         }
-        if (settings.hasValue("hashSetSize")) {
+        if (settings.hasValue("hash_set_size")) {
             settingMap.put("hashSetSize", settings.get("hash_set_size"));
         }
         if (settings.hasValue("with_rotation")) {

modules/analysis-common/src/test/java/org/opensearch/analysis/common/MinHashFilterFactoryTests.java (+54 -12)

@@ -50,14 +50,10 @@ public void testDefault() throws IOException {
         int default_bucket_size = 512;
         int default_hash_set_size = 1;
         Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
-        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
-            settings,
-            new CommonAnalysisModulePlugin()
-        );
+        OpenSearchTestCase.TestAnalysis analysis = getTestAnalysisFromSettings(settings);
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
         String source = "the quick brown fox";
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
+        Tokenizer tokenizer = getTokenizer(source);

         // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
         // have enough tokens to fill all the buckets, we still expect 512 tokens.

@@ -73,17 +69,63 @@ public void testSettings() throws IOException {
             .put("index.analysis.filter.test_min_hash.with_rotation", false)
             .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
             .build();
-        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
-            settings,
-            new CommonAnalysisModulePlugin()
-        );
+        OpenSearchTestCase.TestAnalysis analysis = getTestAnalysisFromSettings(settings);
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
         String source = "sushi";
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
+        Tokenizer tokenizer = getTokenizer(source);

         // despite the fact that bucket_count is 2 and hash_set_size is 1,
         // because with_rotation is false, we only expect 1 token here.
         assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
     }
+
+    public void testBucketCountSetting() throws IOException {
+        // Correct case with "bucket_count"
+        Settings settingsWithBucketCount = Settings.builder()
+            .put("index.analysis.filter.test_min_hash.type", "min_hash")
+            .put("index.analysis.filter.test_min_hash.hash_count", "1")
+            .put("index.analysis.filter.test_min_hash.bucket_count", "3")
+            .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
+            .put("index.analysis.filter.test_min_hash.with_rotation", false)
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+
+        OpenSearchTestCase.TestAnalysis analysisWithBucketCount = getTestAnalysisFromSettings(settingsWithBucketCount);
+
+        TokenFilterFactory tokenFilterWithBucketCount = analysisWithBucketCount.tokenFilter.get("test_min_hash");
+        String sourceWithBucketCount = "salmon avocado roll uramaki";
+        Tokenizer tokenizerWithBucketCount = getTokenizer(sourceWithBucketCount);
+        // Expect 3 tokens due to bucket_count being set to 3
+        assertStreamHasNumberOfTokens(tokenFilterWithBucketCount.create(tokenizerWithBucketCount), 3);
+    }
+
+    public void testHashSetSizeSetting() throws IOException {
+        // Correct case with "hash_set_size"
+        Settings settingsWithHashSetSize = Settings.builder()
+            .put("index.analysis.filter.test_min_hash.type", "min_hash")
+            .put("index.analysis.filter.test_min_hash.hash_count", "1")
+            .put("index.analysis.filter.test_min_hash.bucket_count", "1")
+            .put("index.analysis.filter.test_min_hash.hash_set_size", "2")
+            .put("index.analysis.filter.test_min_hash.with_rotation", false)
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+
+        OpenSearchTestCase.TestAnalysis analysisWithHashSetSize = getTestAnalysisFromSettings(settingsWithHashSetSize);
+
+        TokenFilterFactory tokenFilterWithHashSetSize = analysisWithHashSetSize.tokenFilter.get("test_min_hash");
+        String sourceWithHashSetSize = "salmon avocado roll uramaki";
+        Tokenizer tokenizerWithHashSetSize = getTokenizer(sourceWithHashSetSize);
+        // Expect 2 tokens due to hash_set_size being set to 2 and bucket_count being 1
+        assertStreamHasNumberOfTokens(tokenFilterWithHashSetSize.create(tokenizerWithHashSetSize), 2);
+    }
+
+    private static OpenSearchTestCase.TestAnalysis getTestAnalysisFromSettings(Settings settingsWithBucketCount) throws IOException {
+        return AnalysisTestsHelper.createTestAnalysisFromSettings(settingsWithBucketCount, new CommonAnalysisModulePlugin());
+    }
+
+    private static Tokenizer getTokenizer(String sourceWithBucketCount) {
+        Tokenizer tokenizerWithBucketCount = new WhitespaceTokenizer();
+        tokenizerWithBucketCount.setReader(new StringReader(sourceWithBucketCount));
+        return tokenizerWithBucketCount;
+    }
 }
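
As a usage illustration of the now-working settings keys, a filter definition along these lines (hypothetical filter name "my_min_hash", mirroring the Settings API used in the tests above) will now forward bucket_count and hash_set_size to the underlying Lucene MinHash filter; before this fix only hash_count and with_rotation took effect. This is a sketch, not code from the commit.

    import org.opensearch.common.settings.Settings;

    public class MinHashFilterSettingsExample {
        public static void main(String[] args) {
            // Hypothetical custom token filter definition; every key below now reaches
            // the MinHash filter after this fix (previously bucket_count and
            // hash_set_size were ignored because the factory checked camelCase keys).
            Settings indexSettings = Settings.builder()
                .put("index.analysis.filter.my_min_hash.type", "min_hash")
                .put("index.analysis.filter.my_min_hash.hash_count", "1")
                .put("index.analysis.filter.my_min_hash.bucket_count", "512")
                .put("index.analysis.filter.my_min_hash.hash_set_size", "1")
                .put("index.analysis.filter.my_min_hash.with_rotation", "true")
                .build();

            // Read back one of the previously ignored settings.
            System.out.println(indexSettings.get("index.analysis.filter.my_min_hash.bucket_count"));
        }
    }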
