41
41
import org .opensearch .cluster .metadata .IndexMetadata ;
42
42
import org .opensearch .common .settings .Settings ;
43
43
import org .opensearch .env .Environment ;
44
+ import org .opensearch .env .TestEnvironment ;
44
45
import org .opensearch .index .IndexSettings ;
46
+ import org .opensearch .index .analysis .AnalysisRegistry ;
45
47
import org .opensearch .index .analysis .IndexAnalyzers ;
46
48
import org .opensearch .index .analysis .PreConfiguredTokenFilter ;
47
49
import org .opensearch .index .analysis .TokenFilterFactory ;
48
50
import org .opensearch .index .analysis .TokenizerFactory ;
51
+ import org .opensearch .indices .analysis .AnalysisModule ;
49
52
import org .opensearch .test .IndexSettingsModule ;
50
53
import org .opensearch .test .OpenSearchTestCase ;
51
54
import org .opensearch .test .VersionUtils ;
63
66
import static org .hamcrest .Matchers .equalTo ;
64
67
import static org .hamcrest .Matchers .instanceOf ;
65
68
import static org .hamcrest .Matchers .startsWith ;
69
+ import static org .apache .lucene .tests .analysis .BaseTokenStreamTestCase .assertTokenStreamContents ;
66
70
67
71
public class SynonymsAnalysisTests extends OpenSearchTestCase {
68
72
private IndexAnalyzers indexAnalyzers ;
@@ -255,14 +259,16 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
255
259
.put ("hyphenation_patterns_path" , "foo" )
256
260
.build ();
257
261
IndexSettings idxSettings = IndexSettingsModule .newIndexSettings ("index" , settings );
258
-
262
+ Environment environment = TestEnvironment .newEnvironment (settings );
263
+ AnalysisModule analysisModule = new AnalysisModule (environment , Collections .singletonList (new CommonAnalysisModulePlugin ()));
264
+ AnalysisRegistry analysisRegistry = analysisModule .getAnalysisRegistry ();
259
265
String [] bypassingFactories = new String [] { "dictionary_decompounder" };
260
266
261
267
CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin ();
262
268
for (String factory : bypassingFactories ) {
263
- TokenFilterFactory tff = plugin .getTokenFilters ().get (factory ).get (idxSettings , null , factory , settings );
264
- TokenizerFactory tok = new KeywordTokenizerFactory (idxSettings , null , "keyword" , settings );
265
- SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory (idxSettings , null , "synonym" , settings );
269
+ TokenFilterFactory tff = plugin .getTokenFilters (analysisModule ).get (factory ).get (idxSettings , environment , factory , settings );
270
+ TokenizerFactory tok = new KeywordTokenizerFactory (idxSettings , environment , "keyword" , settings );
271
+ SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory (idxSettings , environment , "synonym" , settings , analysisRegistry );
266
272
Analyzer analyzer = stff .buildSynonymAnalyzer (tok , Collections .emptyList (), Collections .singletonList (tff ), null );
267
273
268
274
try (TokenStream ts = analyzer .tokenStream ("field" , "text" )) {
@@ -319,7 +325,11 @@ public void testDisallowedTokenFilters() throws IOException {
319
325
.putList ("common_words" , "a" , "b" )
320
326
.put ("output_unigrams" , "true" )
321
327
.build ();
328
+
329
+ Environment environment = TestEnvironment .newEnvironment (settings );
322
330
IndexSettings idxSettings = IndexSettingsModule .newIndexSettings ("index" , settings );
331
+ AnalysisModule analysisModule = new AnalysisModule (environment , Collections .singletonList (new CommonAnalysisModulePlugin ()));
332
+ AnalysisRegistry analysisRegistry = analysisModule .getAnalysisRegistry ();
323
333
CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin ();
324
334
325
335
String [] disallowedFactories = new String [] {
@@ -333,9 +343,9 @@ public void testDisallowedTokenFilters() throws IOException {
333
343
"fingerprint" };
334
344
335
345
for (String factory : disallowedFactories ) {
336
- TokenFilterFactory tff = plugin .getTokenFilters ().get (factory ).get (idxSettings , null , factory , settings );
337
- TokenizerFactory tok = new KeywordTokenizerFactory (idxSettings , null , "keyword" , settings );
338
- SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory (idxSettings , null , "synonym" , settings );
346
+ TokenFilterFactory tff = plugin .getTokenFilters (analysisModule ).get (factory ).get (idxSettings , environment , factory , settings );
347
+ TokenizerFactory tok = new KeywordTokenizerFactory (idxSettings , environment , "keyword" , settings );
348
+ SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory (idxSettings , environment , "synonym" , settings , analysisRegistry );
339
349
340
350
IllegalArgumentException e = expectThrows (
341
351
IllegalArgumentException .class ,
@@ -362,4 +372,75 @@ private void match(String analyzerName, String source, String target) throws IOE
362
372
MatcherAssert .assertThat (target , equalTo (sb .toString ().trim ()));
363
373
}
364
374
375
+ /**
376
+ * Tests the integration of word delimiter and synonym graph filters with synonym_analyzer based on issue #16263.
377
+ * This test verifies the correct handling of:
378
+ * 1. Hyphenated words with word delimiter (e.g., "note-book" → ["notebook", "note", "book"])
379
+ * 2. Multi-word synonyms (e.g., "mobile phone" → ["smartphone"])
380
+ * 3. Single word synonyms (e.g., "laptop" → ["notebook"])
381
+ *
382
+ * @see <a href="https://github.com/opensearch-project/OpenSearch/issues/16263">Issue #16263</a>
383
+ */
384
+ public void testSynonymAnalyzerWithWordDelimiter () throws IOException {
385
+ Settings settings = Settings .builder ()
386
+ .put (IndexMetadata .SETTING_VERSION_CREATED , Version .CURRENT )
387
+ .put ("path.home" , createTempDir ().toString ())
388
+ .put ("index.analysis.filter.custom_word_delimiter.type" , "word_delimiter_graph" )
389
+ .put ("index.analysis.filter.custom_word_delimiter.generate_word_parts" , true )
390
+ .put ("index.analysis.filter.custom_word_delimiter.catenate_all" , true )
391
+ .put ("index.analysis.filter.custom_word_delimiter.split_on_numerics" , false )
392
+ .put ("index.analysis.filter.custom_word_delimiter.split_on_case_change" , false )
393
+ .put ("index.analysis.filter.custom_pattern_replace_filter.type" , "pattern_replace" )
394
+ .put ("index.analysis.filter.custom_pattern_replace_filter.pattern" , "(-)" )
395
+ .put ("index.analysis.filter.custom_pattern_replace_filter.replacement" , " " )
396
+ .put ("index.analysis.filter.custom_pattern_replace_filter.all" , true )
397
+ .put ("index.analysis.filter.custom_synonym_graph_filter.type" , "synonym_graph" )
398
+ .putList (
399
+ "index.analysis.filter.custom_synonym_graph_filter.synonyms" ,
400
+ "laptop => notebook" ,
401
+ "smartphone, mobile phone, cell phone => smartphone" ,
402
+ "tv, television => television"
403
+ )
404
+ .put ("index.analysis.filter.custom_synonym_graph_filter.synonym_analyzer" , "standard" )
405
+ .put ("index.analysis.analyzer.text_en_index.type" , "custom" )
406
+ .put ("index.analysis.analyzer.text_en_index.tokenizer" , "whitespace" )
407
+ .putList (
408
+ "index.analysis.analyzer.text_en_index.filter" ,
409
+ "lowercase" ,
410
+ "custom_word_delimiter" ,
411
+ "custom_synonym_graph_filter" ,
412
+ "custom_pattern_replace_filter" ,
413
+ "flatten_graph"
414
+ )
415
+ .build ();
416
+ Environment environment = TestEnvironment .newEnvironment (settings );
417
+ IndexSettings indexSettings = IndexSettingsModule .newIndexSettings ("test" , settings );
418
+ AnalysisModule module = new AnalysisModule (environment , Collections .singletonList (new CommonAnalysisModulePlugin ()));
419
+ IndexAnalyzers analyzers = module .getAnalysisRegistry ().build (indexSettings );
420
+ try (TokenStream ts = analyzers .get ("text_en_index" ).tokenStream ("" , "note-book" )) {
421
+ assertTokenStreamContents (
422
+ ts ,
423
+ new String [] { "notebook" , "note" , "book" },
424
+ new int [] { 0 , 0 , 5 },
425
+ new int [] { 9 , 4 , 9 },
426
+ new String [] { "word" , "word" , "word" },
427
+ new int [] { 1 , 0 , 1 },
428
+ new int [] { 2 , 1 , 1 }
429
+ );
430
+ }
431
+ try (TokenStream ts = analyzers .get ("text_en_index" ).tokenStream ("" , "mobile phone" )) {
432
+ assertTokenStreamContents (
433
+ ts ,
434
+ new String [] { "smartphone" },
435
+ new int [] { 0 },
436
+ new int [] { 12 },
437
+ new String [] { "SYNONYM" },
438
+ new int [] { 1 },
439
+ new int [] { 1 }
440
+ );
441
+ }
442
+ try (TokenStream ts = analyzers .get ("text_en_index" ).tokenStream ("" , "laptop" )) {
443
+ assertTokenStreamContents (ts , new String [] { "notebook" }, new int [] { 0 }, new int [] { 6 });
444
+ }
445
+ }
365
446
}
0 commit comments