Skip to content

Commit 5068fad

Browse files
Add a new configuration setting synonym_analyzer for synonym_graph and synonym. (opensearch-project#16488)
* Add custom synonym_analyzer
* synonym_analyzer configuration setting

---------

Signed-off-by: Prudhvi Godithi <pgodithi@amazon.com>
1 parent 53d41d3 commit 5068fad

File tree

10 files changed

+250
-16
lines changed

10 files changed

+250
-16
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1717
- Support for keyword fields in star-tree index ([#16233](https://github.com/opensearch-project/OpenSearch/pull/16233))
1818
- Add a flag in QueryShardContext to differentiate inner hit query ([#16600](https://github.com/opensearch-project/OpenSearch/pull/16600))
1919
- Add vertical scaling and SoftReference for snapshot repository data cache ([#16489](https://github.com/opensearch-project/OpenSearch/pull/16489))
20+
- Add new configuration setting `synonym_analyzer` to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)).
2021

2122
### Dependencies
2223
- Bump `com.azure:azure-storage-common` from 12.25.1 to 12.27.1 ([#16521](https://github.com/opensearch-project/OpenSearch/pull/16521))

modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java

+26-3
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@
146146
import org.opensearch.index.analysis.PreConfiguredTokenizer;
147147
import org.opensearch.index.analysis.TokenFilterFactory;
148148
import org.opensearch.index.analysis.TokenizerFactory;
149+
import org.opensearch.indices.analysis.AnalysisModule;
149150
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;
150151
import org.opensearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
151152
import org.opensearch.plugins.AnalysisPlugin;
@@ -247,7 +248,7 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
247248
}
248249

249250
@Override
250-
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
251+
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
251252
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
252253
filters.put("apostrophe", ApostropheFilterFactory::new);
253254
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
@@ -332,14 +333,36 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
332333
filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
333334
filters.put("stemmer_override", requiresAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
334335
filters.put("stemmer", StemmerTokenFilterFactory::new);
335-
filters.put("synonym", requiresAnalysisSettings(SynonymTokenFilterFactory::new));
336-
filters.put("synonym_graph", requiresAnalysisSettings(SynonymGraphTokenFilterFactory::new));
337336
filters.put("trim", TrimTokenFilterFactory::new);
338337
filters.put("truncate", requiresAnalysisSettings(TruncateTokenFilterFactory::new));
339338
filters.put("unique", UniqueTokenFilterFactory::new);
340339
filters.put("uppercase", UpperCaseTokenFilterFactory::new);
341340
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
342341
filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
342+
filters.put(
343+
"synonym",
344+
requiresAnalysisSettings(
345+
(indexSettings, environment, name, settings) -> new SynonymTokenFilterFactory(
346+
indexSettings,
347+
environment,
348+
name,
349+
settings,
350+
analysisModule.getAnalysisRegistry()
351+
)
352+
)
353+
);
354+
filters.put(
355+
"synonym_graph",
356+
requiresAnalysisSettings(
357+
(indexSettings, environment, name, settings) -> new SynonymGraphTokenFilterFactory(
358+
indexSettings,
359+
environment,
360+
name,
361+
settings,
362+
analysisModule.getAnalysisRegistry()
363+
)
364+
)
365+
);
343366
return filters;
344367
}
345368

modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java

+9-2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import org.opensearch.env.Environment;
4141
import org.opensearch.index.IndexSettings;
4242
import org.opensearch.index.analysis.AnalysisMode;
43+
import org.opensearch.index.analysis.AnalysisRegistry;
4344
import org.opensearch.index.analysis.CharFilterFactory;
4445
import org.opensearch.index.analysis.TokenFilterFactory;
4546
import org.opensearch.index.analysis.TokenizerFactory;
@@ -49,8 +50,14 @@
4950

5051
public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
5152

52-
SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
53-
super(indexSettings, env, name, settings);
53+
SynonymGraphTokenFilterFactory(
54+
IndexSettings indexSettings,
55+
Environment env,
56+
String name,
57+
Settings settings,
58+
AnalysisRegistry analysisRegistry
59+
) {
60+
super(indexSettings, env, name, settings, analysisRegistry);
5461
}
5562

5663
@Override

modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java

+25-3
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,13 @@
4444
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
4545
import org.opensearch.index.analysis.Analysis;
4646
import org.opensearch.index.analysis.AnalysisMode;
47+
import org.opensearch.index.analysis.AnalysisRegistry;
4748
import org.opensearch.index.analysis.CharFilterFactory;
4849
import org.opensearch.index.analysis.CustomAnalyzer;
4950
import org.opensearch.index.analysis.TokenFilterFactory;
5051
import org.opensearch.index.analysis.TokenizerFactory;
5152

53+
import java.io.IOException;
5254
import java.io.Reader;
5355
import java.io.StringReader;
5456
import java.util.List;
@@ -64,8 +66,16 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
6466
protected final Settings settings;
6567
protected final Environment environment;
6668
protected final AnalysisMode analysisMode;
67-
68-
SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
69+
private final String synonymAnalyzerName;
70+
private final AnalysisRegistry analysisRegistry;
71+
72+
SynonymTokenFilterFactory(
73+
IndexSettings indexSettings,
74+
Environment env,
75+
String name,
76+
Settings settings,
77+
AnalysisRegistry analysisRegistry
78+
) {
6979
super(indexSettings, name, settings);
7080
this.settings = settings;
7181

@@ -83,6 +93,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
8393
boolean updateable = settings.getAsBoolean("updateable", false);
8494
this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL;
8595
this.environment = env;
96+
this.synonymAnalyzerName = settings.get("synonym_analyzer", null);
97+
this.analysisRegistry = analysisRegistry;
8698
}
8799

88100
@Override
@@ -137,6 +149,17 @@ Analyzer buildSynonymAnalyzer(
137149
List<TokenFilterFactory> tokenFilters,
138150
Function<String, TokenFilterFactory> allFilters
139151
) {
152+
if (synonymAnalyzerName != null) {
153+
Analyzer customSynonymAnalyzer;
154+
try {
155+
customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
156+
} catch (IOException e) {
157+
throw new RuntimeException(e);
158+
}
159+
if (customSynonymAnalyzer != null) {
160+
return customSynonymAnalyzer;
161+
}
162+
}
140163
return new CustomAnalyzer(
141164
tokenizer,
142165
charFilters.toArray(new CharFilterFactory[0]),
@@ -177,5 +200,4 @@ Reader getRulesFromSettings(Environment env) {
177200
}
178201
return rulesReader;
179202
}
180-
181203
}

modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java

+22
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,16 @@
3939
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
4040
import org.apache.lucene.analysis.te.TeluguNormalizationFilterFactory;
4141
import org.apache.lucene.analysis.te.TeluguStemFilterFactory;
42+
import org.opensearch.index.analysis.TokenFilterFactory;
4243
import org.opensearch.indices.analysis.AnalysisFactoryTestCase;
44+
import org.opensearch.indices.analysis.AnalysisModule;
4345

4446
import java.util.List;
4547
import java.util.Map;
4648
import java.util.TreeMap;
4749

50+
import org.mockito.Mock;
51+
4852
import static java.util.Collections.emptyList;
4953
import static java.util.stream.Collectors.toList;
5054

@@ -53,6 +57,9 @@ public CommonAnalysisFactoryTests() {
5357
super(new CommonAnalysisModulePlugin());
5458
}
5559

60+
@Mock
61+
private AnalysisModule analysisModule;
62+
5663
@Override
5764
protected Map<String, Class<?>> getTokenizers() {
5865
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
@@ -302,4 +309,19 @@ private void markedTestCase(String name, Map<String, Class<?>> map) {
302309
unmarked
303310
);
304311
}
312+
313+
/**
314+
* Tests the getTokenFilters(AnalysisModule) method to verify:
315+
* 1. All token filters are properly loaded
316+
* 2. Basic filters remain available
317+
* 3. Synonym filters remain available when AnalysisModule is provided
318+
*/
319+
public void testGetTokenFiltersWithAnalysisModule() {
320+
CommonAnalysisModulePlugin plugin = (CommonAnalysisModulePlugin) getAnalysisPlugin();
321+
Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(analysisModule);
322+
assertNotNull("Token filters should not be null", filters);
323+
assertTrue("Should contain basic filters", filters.containsKey("lowercase"));
324+
assertTrue("Should contain synonym filter", filters.containsKey("synonym"));
325+
assertTrue("Should contain synonym_graph filter", filters.containsKey("synonym_graph"));
326+
}
305327
}

modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java

+88-7
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,14 @@
4141
import org.opensearch.cluster.metadata.IndexMetadata;
4242
import org.opensearch.common.settings.Settings;
4343
import org.opensearch.env.Environment;
44+
import org.opensearch.env.TestEnvironment;
4445
import org.opensearch.index.IndexSettings;
46+
import org.opensearch.index.analysis.AnalysisRegistry;
4547
import org.opensearch.index.analysis.IndexAnalyzers;
4648
import org.opensearch.index.analysis.PreConfiguredTokenFilter;
4749
import org.opensearch.index.analysis.TokenFilterFactory;
4850
import org.opensearch.index.analysis.TokenizerFactory;
51+
import org.opensearch.indices.analysis.AnalysisModule;
4952
import org.opensearch.test.IndexSettingsModule;
5053
import org.opensearch.test.OpenSearchTestCase;
5154
import org.opensearch.test.VersionUtils;
@@ -63,6 +66,7 @@
6366
import static org.hamcrest.Matchers.equalTo;
6467
import static org.hamcrest.Matchers.instanceOf;
6568
import static org.hamcrest.Matchers.startsWith;
69+
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
6670

6771
public class SynonymsAnalysisTests extends OpenSearchTestCase {
6872
private IndexAnalyzers indexAnalyzers;
@@ -255,14 +259,16 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
255259
.put("hyphenation_patterns_path", "foo")
256260
.build();
257261
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
258-
262+
Environment environment = TestEnvironment.newEnvironment(settings);
263+
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin()));
264+
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();
259265
String[] bypassingFactories = new String[] { "dictionary_decompounder" };
260266

261267
CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin();
262268
for (String factory : bypassingFactories) {
263-
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
264-
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
265-
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
269+
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
270+
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
271+
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
266272
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
267273

268274
try (TokenStream ts = analyzer.tokenStream("field", "text")) {
@@ -319,7 +325,11 @@ public void testDisallowedTokenFilters() throws IOException {
319325
.putList("common_words", "a", "b")
320326
.put("output_unigrams", "true")
321327
.build();
328+
329+
Environment environment = TestEnvironment.newEnvironment(settings);
322330
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
331+
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin()));
332+
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();
323333
CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin();
324334

325335
String[] disallowedFactories = new String[] {
@@ -333,9 +343,9 @@ public void testDisallowedTokenFilters() throws IOException {
333343
"fingerprint" };
334344

335345
for (String factory : disallowedFactories) {
336-
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
337-
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
338-
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
346+
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
347+
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
348+
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
339349

340350
IllegalArgumentException e = expectThrows(
341351
IllegalArgumentException.class,
@@ -362,4 +372,75 @@ private void match(String analyzerName, String source, String target) throws IOE
362372
MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
363373
}
364374

375+
/**
376+
* Tests the integration of word delimiter and synonym graph filters with synonym_analyzer based on issue #16263.
377+
* This test verifies the correct handling of:
378+
* 1. Hyphenated words with word delimiter (e.g., "note-book" → ["notebook", "note", "book"])
379+
* 2. Multi-word synonyms (e.g., "mobile phone" → ["smartphone"])
380+
* 3. Single word synonyms (e.g., "laptop" → ["notebook"])
381+
*
382+
* @see <a href="https://github.com/opensearch-project/OpenSearch/issues/16263">Issue #16263</a>
383+
*/
384+
public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
385+
Settings settings = Settings.builder()
386+
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
387+
.put("path.home", createTempDir().toString())
388+
.put("index.analysis.filter.custom_word_delimiter.type", "word_delimiter_graph")
389+
.put("index.analysis.filter.custom_word_delimiter.generate_word_parts", true)
390+
.put("index.analysis.filter.custom_word_delimiter.catenate_all", true)
391+
.put("index.analysis.filter.custom_word_delimiter.split_on_numerics", false)
392+
.put("index.analysis.filter.custom_word_delimiter.split_on_case_change", false)
393+
.put("index.analysis.filter.custom_pattern_replace_filter.type", "pattern_replace")
394+
.put("index.analysis.filter.custom_pattern_replace_filter.pattern", "(-)")
395+
.put("index.analysis.filter.custom_pattern_replace_filter.replacement", " ")
396+
.put("index.analysis.filter.custom_pattern_replace_filter.all", true)
397+
.put("index.analysis.filter.custom_synonym_graph_filter.type", "synonym_graph")
398+
.putList(
399+
"index.analysis.filter.custom_synonym_graph_filter.synonyms",
400+
"laptop => notebook",
401+
"smartphone, mobile phone, cell phone => smartphone",
402+
"tv, television => television"
403+
)
404+
.put("index.analysis.filter.custom_synonym_graph_filter.synonym_analyzer", "standard")
405+
.put("index.analysis.analyzer.text_en_index.type", "custom")
406+
.put("index.analysis.analyzer.text_en_index.tokenizer", "whitespace")
407+
.putList(
408+
"index.analysis.analyzer.text_en_index.filter",
409+
"lowercase",
410+
"custom_word_delimiter",
411+
"custom_synonym_graph_filter",
412+
"custom_pattern_replace_filter",
413+
"flatten_graph"
414+
)
415+
.build();
416+
Environment environment = TestEnvironment.newEnvironment(settings);
417+
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
418+
AnalysisModule module = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin()));
419+
IndexAnalyzers analyzers = module.getAnalysisRegistry().build(indexSettings);
420+
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "note-book")) {
421+
assertTokenStreamContents(
422+
ts,
423+
new String[] { "notebook", "note", "book" },
424+
new int[] { 0, 0, 5 },
425+
new int[] { 9, 4, 9 },
426+
new String[] { "word", "word", "word" },
427+
new int[] { 1, 0, 1 },
428+
new int[] { 2, 1, 1 }
429+
);
430+
}
431+
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "mobile phone")) {
432+
assertTokenStreamContents(
433+
ts,
434+
new String[] { "smartphone" },
435+
new int[] { 0 },
436+
new int[] { 12 },
437+
new String[] { "SYNONYM" },
438+
new int[] { 1 },
439+
new int[] { 1 }
440+
);
441+
}
442+
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "laptop")) {
443+
assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
444+
}
445+
}
365446
}

server/src/main/java/org/opensearch/indices/analysis/AnalysisModule.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,12 @@ public boolean requiresAnalysisSettings() {
165165
)
166166
);
167167

168-
tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
168+
for (AnalysisPlugin plugin : plugins) {
169+
Map<String, AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(this);
170+
for (Map.Entry<String, AnalysisProvider<TokenFilterFactory>> entry : filters.entrySet()) {
171+
tokenFilters.register(entry.getKey(), entry.getValue());
172+
}
173+
}
169174
return tokenFilters;
170175
}
171176

server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java

+9
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
import org.opensearch.index.analysis.PreConfiguredTokenizer;
4848
import org.opensearch.index.analysis.TokenFilterFactory;
4949
import org.opensearch.index.analysis.TokenizerFactory;
50+
import org.opensearch.indices.analysis.AnalysisModule;
5051
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;
5152

5253
import java.io.IOException;
@@ -84,6 +85,14 @@ default Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
8485
return emptyMap();
8586
}
8687

88+
/**
89+
* Override to add additional {@link TokenFilter}s that need access to the AnalysisModule.
90+
* The default implementation for plugins that don't need AnalysisModule calls the existing getTokenFilters() method.
91+
*/
92+
default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
93+
return getTokenFilters();
94+
}
95+
8796
/**
8897
* Override to add additional {@link TokenFilter}s. See {@link #requiresAnalysisSettings(AnalysisProvider)}
8998
on how to get the configuration from the index.

0 commit comments

Comments (0)