
Commit 11836d0

Add kuromoji_completion analyzer and filter (opensearch-project#4835) (opensearch-project#12287)
* Add kuromoji_completion analyzer and filter (opensearch-project#4835)

  Signed-off-by: Tatsuya Kawakami <43780506+hogesako@users.noreply.github.com>

* Use INDEX mode if an invalid value is set for mode in the kuromoji_completion filter

  Signed-off-by: Tatsuya Kawakami <43780506+hogesako@users.noreply.github.com>

---------

Signed-off-by: Tatsuya Kawakami <43780506+hogesako@users.noreply.github.com>
1 parent ca827dd commit 11836d0
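
For reference, a minimal standalone sketch of what the new analyzer produces, using Lucene's JapaneseCompletionAnalyzer directly (the class this commit wires into OpenSearch). The expected tokens are taken from the tests added below; the class name CompletionAnalyzerSketch and the field name "f" are illustrative only, and the sketch assumes lucene-analysis-kuromoji is on the classpath.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CompletionAnalyzerSketch {
    public static void main(String[] args) throws Exception {
        // INDEX mode (the default): each surface form is accompanied by its romaji readings,
        // e.g. 寿司がおいしいね -> 寿司, susi, sushi, が, ga, おいしい, oisii, oishii, ね, ne
        // QUERY mode instead keeps the trailing run together (per the tests below):
        // 寿司, susi, sushi, がおいしいね, gaoisiine, gaoishiine
        try (
            Analyzer analyzer = new JapaneseCompletionAnalyzer(null, JapaneseCompletionFilter.Mode.INDEX);
            TokenStream stream = analyzer.tokenStream("f", "寿司がおいしいね")
        ) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // print each completion token
            }
            stream.end();
        }
    }
}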

File tree

8 files changed: +200 -1 lines changed


CHANGELOG.md
+1

@@ -105,6 +105,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Add toString methods to MultiSearchRequest, MultiGetRequest and CreateIndexRequest ([#12163](https://github.com/opensearch-project/OpenSearch/pull/12163))
 - Support for returning scores in matched queries ([#11626](https://github.com/opensearch-project/OpenSearch/pull/11626))
 - Add shard id property to SearchLookup for use in field types provided by plugins ([#1063](https://github.com/opensearch-project/OpenSearch/pull/1063))
+- Add kuromoji_completion analyzer and filter ([#4835](https://github.com/opensearch-project/OpenSearch/issues/4835))
 
 ### Dependencies
 - Bump `peter-evans/find-comment` from 2 to 3 ([#12288](https://github.com/opensearch-project/OpenSearch/pull/12288))
plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionAnalyzerProvider.java
+34

@@ -0,0 +1,34 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.analysis;
+
+import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
+import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
+import org.apache.lucene.analysis.ja.dict.UserDictionary;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.index.IndexSettings;
+
+public class KuromojiCompletionAnalyzerProvider extends AbstractIndexAnalyzerProvider<JapaneseCompletionAnalyzer> {
+
+    private final JapaneseCompletionAnalyzer analyzer;
+
+    public KuromojiCompletionAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        final JapaneseCompletionFilter.Mode mode = KuromojiCompletionFilterFactory.getMode(settings);
+        final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
+        analyzer = new JapaneseCompletionAnalyzer(userDictionary, mode);
+    }
+
+    @Override
+    public JapaneseCompletionAnalyzer get() {
+        return this.analyzer;
+    }
+
+}
plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionFilterFactory.java
+42

@@ -0,0 +1,42 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
+import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.index.IndexSettings;
+
+public class KuromojiCompletionFilterFactory extends AbstractTokenFilterFactory {
+    private final Mode mode;
+
+    public KuromojiCompletionFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        this.mode = getMode(settings);
+    }
+
+    public static Mode getMode(Settings settings) {
+        String modeSetting = settings.get("mode", null);
+        if (modeSetting != null) {
+            if ("index".equalsIgnoreCase(modeSetting)) {
+                return Mode.INDEX;
+            } else if ("query".equalsIgnoreCase(modeSetting)) {
+                return Mode.QUERY;
+            }
+        }
+        return Mode.INDEX;
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new JapaneseCompletionFilter(tokenStream, mode);
+    }
+}
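
Per the second bullet of the commit message, getMode above falls back to INDEX whenever the mode setting is missing or not recognized. A small hypothetical sketch of that behavior; the class name and the setting values are made up for illustration.

import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
import org.opensearch.common.settings.Settings;
import org.opensearch.index.analysis.KuromojiCompletionFilterFactory;

public class ModeFallbackSketch {
    public static void main(String[] args) {
        // "index" and "query" are matched case-insensitively.
        Mode query = KuromojiCompletionFilterFactory.getMode(Settings.builder().put("mode", "QUERY").build());
        // Any other value (or no "mode" setting at all) falls back to INDEX rather than failing.
        Mode fallback = KuromojiCompletionFilterFactory.getMode(Settings.builder().put("mode", "not-a-mode").build());
        System.out.println(query);    // QUERY
        System.out.println(fallback); // INDEX
    }
}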

plugins/analysis-kuromoji/src/main/java/org/opensearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
+7 -1

@@ -38,6 +38,8 @@
 import org.opensearch.index.analysis.JapaneseStopTokenFilterFactory;
 import org.opensearch.index.analysis.KuromojiAnalyzerProvider;
 import org.opensearch.index.analysis.KuromojiBaseFormFilterFactory;
+import org.opensearch.index.analysis.KuromojiCompletionAnalyzerProvider;
+import org.opensearch.index.analysis.KuromojiCompletionFilterFactory;
 import org.opensearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
 import org.opensearch.index.analysis.KuromojiKatakanaStemmerFactory;
 import org.opensearch.index.analysis.KuromojiNumberFilterFactory;
@@ -70,6 +72,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         extra.put("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
         extra.put("ja_stop", JapaneseStopTokenFilterFactory::new);
         extra.put("kuromoji_number", KuromojiNumberFilterFactory::new);
+        extra.put("kuromoji_completion", KuromojiCompletionFilterFactory::new);
         return extra;
     }
 
@@ -80,6 +83,9 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
 
     @Override
     public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
-        return singletonMap("kuromoji", KuromojiAnalyzerProvider::new);
+        Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
+        extra.put("kuromoji", KuromojiAnalyzerProvider::new);
+        extra.put("kuromoji_completion", KuromojiCompletionAnalyzerProvider::new);
+        return extra;
     }
 }
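
With the registrations above, "kuromoji_completion" is available both as an analyzer name and as a token filter name in index analysis settings. A hedged sketch of settings that would exercise both registrations; the analyzer names my_completion and my_custom are hypothetical and not taken from the commit itself.

import org.opensearch.common.settings.Settings;

public class CompletionSettingsSketch {
    public static void main(String[] args) {
        Settings settings = Settings.builder()
            // Analyzer registered via getAnalyzers(): "kuromoji_completion" (mode: "index" or "query")
            .put("index.analysis.analyzer.my_completion.type", "kuromoji_completion")
            .put("index.analysis.analyzer.my_completion.mode", "index")
            // Token filter registered via getTokenFilters(): "kuromoji_completion" on a custom analyzer
            .put("index.analysis.analyzer.my_custom.type", "custom")
            .put("index.analysis.analyzer.my_custom.tokenizer", "kuromoji_tokenizer")
            .putList("index.analysis.analyzer.my_custom.filter", "kuromoji_completion")
            .build();
        System.out.println(settings);
    }
}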

plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/AnalysisKuromojiFactoryTests.java
+1

@@ -59,6 +59,7 @@ protected Map<String, Class<?>> getTokenFilters() {
         filters.put("japanesereadingform", KuromojiReadingFormFilterFactory.class);
         filters.put("japanesekatakanastem", KuromojiKatakanaStemmerFactory.class);
         filters.put("japanesenumber", KuromojiNumberFilterFactory.class);
+        filters.put("japanesecompletion", KuromojiCompletionFilterFactory.class);
         return filters;
     }
 

plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java
+69

@@ -36,6 +36,7 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
+import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.opensearch.Version;
@@ -85,6 +86,15 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
         filterFactory = analysis.tokenFilter.get("kuromoji_number");
         assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));
 
+        filterFactory = analysis.tokenFilter.get("kuromoji_completion");
+        assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));
+
+        filterFactory = analysis.tokenFilter.get("kuromoji_completion_index");
+        assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));
+
+        filterFactory = analysis.tokenFilter.get("kuromoji_completion_query");
+        assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));
+
         IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
         NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji");
         assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
@@ -93,6 +103,15 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
         assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
         assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
 
+        analyzer = indexAnalyzers.get("kuromoji_completion");
+        assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));
+
+        analyzer = indexAnalyzers.get("kuromoji_completion_index");
+        assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));
+
+        analyzer = indexAnalyzers.get("kuromoji_completion_query");
+        assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));
+
         CharFilterFactory charFilterFactory = analysis.charFilter.get("kuromoji_iteration_mark");
         assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
 
@@ -199,6 +218,32 @@ public void testKatakanaStemFilter() throws IOException {
         assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
     }
 
+    public void testJapaneseCompletionFilter() throws IOException {
+        TestAnalysis analysis = createTestAnalysis();
+
+        String source = "寿司がおいしいね";
+        String[] expected_tokens = new String[] { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" };
+
+        // mode = INDEX(default)
+        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+        tokenizer.setReader(new StringReader(source));
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_completion");
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
+
+        // mode = INDEX
+        tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+        tokenizer.setReader(new StringReader(source));
+        tokenFilter = analysis.tokenFilter.get("kuromoji_completion_index");
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
+
+        // mode = QUERY
+        tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+        tokenizer.setReader(new StringReader(source));
+        tokenFilter = analysis.tokenFilter.get("kuromoji_completion_query");
+        expected_tokens = new String[] { "寿司", "susi", "sushi", "がおいしいね", "gaoisiine", "gaoishiine" };
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
+    }
+
     public void testIterationMarkCharFilter() throws IOException {
         TestAnalysis analysis = createTestAnalysis();
         // test only kanji
@@ -414,6 +459,30 @@ public void testDiscardCompoundToken() throws Exception {
         assertSimpleTSOutput(tokenizer, expected);
     }
 
+    public void testJapaneseCompletionAnalyzer() throws Exception {
+        TestAnalysis analysis = createTestAnalysis();
+        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
+        NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji_completion");
+
+        // mode = INDEX(default)
+        try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) {
+            assertTokenStreamContents(stream, new String[] { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" });
+        }
+
+        // mode = INDEX
+        analyzer = indexAnalyzers.get("kuromoji_completion_index");
+        try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) {
+            assertTokenStreamContents(stream, new String[] { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" });
+        }
+
+        // mode = QUERY
+        analyzer = indexAnalyzers.get("kuromoji_completion_query");
+        try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) {
+            assertTokenStreamContents(stream, new String[] { "寿司", "susi", "sushi", "がおいしいね", "gaoisiine", "gaoishiine" });
+        }
+
+    }
+
     private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
         InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
         Path home = createTempDir();

plugins/analysis-kuromoji/src/test/resources/org/opensearch/index/analysis/kuromoji_analysis.json
+16

@@ -17,6 +17,14 @@
         "ja_stop" : {
             "type": "ja_stop",
             "stopwords": ["_japanese_", "スピード"]
+        },
+        "kuromoji_completion_index" : {
+            "type" : "kuromoji_completion",
+            "mode" : "index"
+        },
+        "kuromoji_completion_query" : {
+            "type" : "kuromoji_completion",
+            "mode" : "query"
         }
     },
 
@@ -70,6 +78,14 @@
         "my_analyzer" : {
             "type" : "custom",
             "tokenizer" : "kuromoji_tokenizer"
+        },
+        "kuromoji_completion_index" : {
+            "type" : "kuromoji_completion",
+            "mode" : "index"
+        },
+        "kuromoji_completion_query" : {
+            "type" : "kuromoji_completion",
+            "mode" : "query"
         }
     }
 
plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml
+30

@@ -16,6 +16,24 @@
     - match: { tokens.5.token: 飲む }
     - match: { tokens.6.token: 行く }
 ---
+"Completion Analyzer":
+    - do:
+        indices.analyze:
+          body:
+            text: 寿司がおいしいね
+            analyzer: kuromoji_completion
+    - length: { tokens: 10 }
+    - match: { tokens.0.token: "寿司" }
+    - match: { tokens.1.token: "susi" }
+    - match: { tokens.2.token: "sushi" }
+    - match: { tokens.3.token: "が" }
+    - match: { tokens.4.token: "ga" }
+    - match: { tokens.5.token: "おいしい" }
+    - match: { tokens.6.token: "oisii" }
+    - match: { tokens.7.token: "oishii" }
+    - match: { tokens.8.token: "ね" }
+    - match: { tokens.9.token: "ne" }
+---
 "Tokenizer":
     - do:
         indices.analyze:
@@ -57,3 +75,15 @@
             filter: [kuromoji_stemmer]
     - length: { tokens: 1 }
     - match: { tokens.0.token: サーバ }
+---
+"Completion filter":
+    - do:
+        indices.analyze:
+          body:
+            text: 寿司
+            tokenizer: kuromoji_tokenizer
+            filter: [kuromoji_completion]
+    - length: { tokens: 3 }
+    - match: { tokens.0.token: "寿司" }
+    - match: { tokens.1.token: "susi" }
+    - match: { tokens.2.token: "sushi" }
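
The "Completion filter" REST test above can also be reproduced with the underlying Lucene classes alone. A minimal sketch, assuming lucene-analysis-kuromoji on the classpath; expected output per the test is 寿司, susi, sushi. The class name is made up for illustration.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CompletionFilterSketch {
    public static void main(String[] args) throws Exception {
        // Tokenize the input the same way the REST test does (kuromoji_tokenizer ~ JapaneseTokenizer, SEARCH mode).
        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
        tokenizer.setReader(new StringReader("寿司"));
        // Wrap it in the completion filter with the default INDEX mode.
        try (TokenStream stream = new JapaneseCompletionFilter(tokenizer, JapaneseCompletionFilter.Mode.INDEX)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // 寿司, susi, sushi
            }
            stream.end();
        }
    }
}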
