Skip to content

Commit a44370d

Browse files
authored
Better plural stemmer than minimal_english (#4738) (#4834)
Drops the trailing "e" in taxes, dresses, watches, dishes, etc., which otherwise causes mismatches between plural and singular forms. Signed-off-by: Nicholas Walter Knize <nknize@apache.org> Co-authored-by: Mark Harwood <markharwood@gmail.com> Co-authored-by: Nicholas Walter Knize <nknize@apache.org> (cherry picked from commit c92846d)
1 parent 45fb2e0 commit a44370d

File tree

4 files changed

+262
-0
lines changed

4 files changed

+262
-0
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
9494
- [Segment Replication] Update replicas to commit SegmentInfos instead of relying on segments_N from primary shards ([#4450](https://github.com/opensearch-project/OpenSearch/pull/4450))
9595
- [Segment Replication] Adding check to make sure checkpoint is not processed when a shard's shard routing is primary ([#4716](https://github.com/opensearch-project/OpenSearch/pull/4716))
9696
- Disable merge on refresh in DiskThresholdDeciderIT ([#4828](https://github.com/opensearch-project/OpenSearch/pull/4828))
97+
- Better plural stemmer than minimal_english ([#4738](https://github.com/opensearch-project/OpenSearch/pull/4738))
9798

9899
### Security
99100

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
/*
10+
* Licensed to Elasticsearch under one or more contributor
11+
* license agreements. See the NOTICE file distributed with
12+
* this work for additional information regarding copyright
13+
* ownership. Elasticsearch licenses this file to you under
14+
* the Apache License, Version 2.0 (the "License"); you may
15+
* not use this file except in compliance with the License.
16+
* You may obtain a copy of the License at
17+
*
18+
* http://www.apache.org/licenses/LICENSE-2.0
19+
*
20+
* Unless required by applicable law or agreed to in writing,
21+
* software distributed under the License is distributed on an
22+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
23+
* KIND, either express or implied. See the License for the
24+
* specific language governing permissions and limitations
25+
* under the License.
26+
*/
27+
28+
/*
29+
* Modifications Copyright OpenSearch Contributors. See
30+
* GitHub history for details.
31+
*/
32+
33+
package org.opensearch.analysis.common;
34+
35+
import org.apache.lucene.analysis.TokenFilter;
36+
import org.apache.lucene.analysis.TokenStream;
37+
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
38+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
39+
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
40+
41+
import java.io.IOException;
42+
43+
public final class EnglishPluralStemFilter extends TokenFilter {
44+
private final EnglishPluralStemmer stemmer = new EnglishPluralStemmer();
45+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
46+
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
47+
48+
/**
 * Creates a plural-stemming filter on top of the given token stream.
 *
 * @param input the upstream token stream; handed unchanged to the superclass constructor
 */
public EnglishPluralStemFilter(TokenStream input) {
49+
super(input);
50+
}
51+
52+
@Override
53+
public boolean incrementToken() throws IOException {
54+
if (input.incrementToken()) {
55+
if (!keywordAttr.isKeyword()) {
56+
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
57+
termAtt.setLength(newlen);
58+
}
59+
return true;
60+
} else {
61+
return false;
62+
}
63+
}
64+
65+
/**
66+
* Plural stemmer for English based on the {@link EnglishMinimalStemFilter}
67+
* <p>
68+
* This stemmer removes plurals but beyond EnglishMinimalStemFilter adds
69+
* four new suffix rules to remove dangling e characters:
70+
* <ul>
71+
* <li>xes - "boxes" becomes "box"</li>
72+
* <li>sses - "dresses" becomes "dress"</li>
73+
* <li>shes - "dishes" becomes "dish"</li>
74+
* <li>tches - "watches" becomes "watch"</li>
75+
* </ul>
76+
* See https://github.com/elastic/elasticsearch/issues/42892
77+
* <p>
78+
* In addition the s stemmer logic is amended so that
79+
* <ul>
80+
* <li>ees-&gt;ee so that bees matches bee</li>
81+
* <li>ies-&gt;y only on longer words to that ties matches tie</li>
82+
* <li>oes-&gt;o rule so that tomatoes matches tomato but retains e for some words eg shoes to shoe</li>
83+
* </ul>
84+
*/
85+
public static class EnglishPluralStemmer {
86+
87+
// Words ending in oes that retain the e when stemmed
88+
public static final char[][] oesExceptions = { "shoes".toCharArray(), "canoes".toCharArray(), "oboes".toCharArray() };
89+
// Words ending in ches that retain the e when stemmed
90+
public static final char[][] chesExceptions = {
91+
"cliches".toCharArray(),
92+
"avalanches".toCharArray(),
93+
"mustaches".toCharArray(),
94+
"moustaches".toCharArray(),
95+
"quiches".toCharArray(),
96+
"headaches".toCharArray(),
97+
"heartaches".toCharArray(),
98+
"porsches".toCharArray(),
99+
"tranches".toCharArray(),
100+
"caches".toCharArray() };
101+
102+
@SuppressWarnings("fallthrough")
103+
public int stem(char s[], int len) {
104+
if (len < 3 || s[len - 1] != 's') return len;
105+
106+
switch (s[len - 2]) {
107+
case 'u':
108+
case 's':
109+
return len;
110+
case 'e':
111+
// Modified ies->y logic from original s-stemmer - only work on strings > 4
112+
// so spies -> spy still but pies->pie.
113+
// The original code also special-cased aies and eies for no good reason as far as I can tell.
114+
// ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
115+
if (len > 4 && s[len - 3] == 'i') {
116+
s[len - 3] = 'y';
117+
return len - 2;
118+
}
119+
120+
// Suffix rules to remove any dangling "e"
121+
if (len > 3) {
122+
// xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe")
123+
if (len > 4 && s[len - 3] == 'x') {
124+
return len - 2;
125+
}
126+
// oes
127+
if (len > 3 && s[len - 3] == 'o') {
128+
if (isException(s, len, oesExceptions)) {
129+
// Only remove the S
130+
return len - 1;
131+
}
132+
// Remove the es
133+
return len - 2;
134+
}
135+
if (len > 4) {
136+
// shes/sses
137+
if (s[len - 4] == 's' && (s[len - 3] == 'h' || s[len - 3] == 's')) {
138+
return len - 2;
139+
}
140+
141+
// ches
142+
if (len > 4) {
143+
if (s[len - 4] == 'c' && s[len - 3] == 'h') {
144+
if (isException(s, len, chesExceptions)) {
145+
// Only remove the S
146+
return len - 1;
147+
}
148+
// Remove the es
149+
return len - 2;
150+
151+
}
152+
}
153+
}
154+
}
155+
156+
default:
157+
return len - 1;
158+
}
159+
}
160+
161+
private boolean isException(char[] s, int len, char[][] exceptionsList) {
162+
for (char[] oesRule : exceptionsList) {
163+
int rulePos = oesRule.length - 1;
164+
int sPos = len - 1;
165+
boolean matched = true;
166+
while (rulePos >= 0 && sPos >= 0) {
167+
if (oesRule[rulePos] != s[sPos]) {
168+
matched = false;
169+
break;
170+
}
171+
rulePos--;
172+
sPos--;
173+
}
174+
if (matched) {
175+
return true;
176+
}
177+
}
178+
return false;
179+
}
180+
}
181+
182+
}

modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java

+2
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,8 @@ public TokenStream create(TokenStream tokenStream) {
154154
return new SnowballFilter(tokenStream, new EnglishStemmer());
155155
} else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
156156
return new EnglishMinimalStemFilter(tokenStream);
157+
} else if ("plural_english".equalsIgnoreCase(language) || "pluralEnglish".equalsIgnoreCase(language)) {
158+
return new EnglishPluralStemFilter(tokenStream);
157159
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
158160
return new EnglishPossessiveFilter(tokenStream);
159161

modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerTokenFilterFactoryTests.java

+77
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,83 @@ public void testPorter2FilterFactory() throws IOException {
111111
}
112112
}
113113

114+
public void testEnglishPluralFilter() throws IOException {
115+
int iters = scaledRandomIntBetween(20, 100);
116+
for (int i = 0; i < iters; i++) {
117+
118+
Version v = VersionUtils.randomVersion(random());
119+
Settings settings = Settings.builder()
120+
.put("index.analysis.filter.my_plurals.type", "stemmer")
121+
.put("index.analysis.filter.my_plurals.language", "plural_english")
122+
.put("index.analysis.analyzer.my_plurals.tokenizer", "whitespace")
123+
.put("index.analysis.analyzer.my_plurals.filter", "my_plurals")
124+
.put(SETTING_VERSION_CREATED, v)
125+
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
126+
.build();
127+
128+
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
129+
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_plurals");
130+
assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
131+
Tokenizer tokenizer = new WhitespaceTokenizer();
132+
tokenizer.setReader(new StringReader("dresses"));
133+
TokenStream create = tokenFilter.create(tokenizer);
134+
IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
135+
NamedAnalyzer analyzer = indexAnalyzers.get("my_plurals");
136+
assertThat(create, instanceOf(EnglishPluralStemFilter.class));
137+
138+
// Check old EnglishMinimalStemmer ("S" stemmer) logic
139+
assertAnalyzesTo(analyzer, "phones", new String[] { "phone" });
140+
assertAnalyzesTo(analyzer, "horses", new String[] { "horse" });
141+
assertAnalyzesTo(analyzer, "cameras", new String[] { "camera" });
142+
143+
// The orginal s stemmer gives up on stemming oes words because English has no fixed rule for the stem
144+
// (see https://howtospell.co.uk/making-O-words-plural )
145+
// This stemmer removes the es but retains e for a small number of exceptions
146+
assertAnalyzesTo(analyzer, "mosquitoes", new String[] { "mosquito" });
147+
assertAnalyzesTo(analyzer, "heroes", new String[] { "hero" });
148+
// oes exceptions that retain the e.
149+
assertAnalyzesTo(analyzer, "shoes", new String[] { "shoe" });
150+
assertAnalyzesTo(analyzer, "horseshoes", new String[] { "horseshoe" });
151+
assertAnalyzesTo(analyzer, "canoes", new String[] { "canoe" });
152+
assertAnalyzesTo(analyzer, "oboes", new String[] { "oboe" });
153+
154+
// Check improved EnglishPluralStemFilter logic
155+
// sses
156+
assertAnalyzesTo(analyzer, "dresses", new String[] { "dress" });
157+
assertAnalyzesTo(analyzer, "possess", new String[] { "possess" });
158+
assertAnalyzesTo(analyzer, "possesses", new String[] { "possess" });
159+
// xes
160+
assertAnalyzesTo(analyzer, "boxes", new String[] { "box" });
161+
assertAnalyzesTo(analyzer, "axes", new String[] { "axe" });
162+
// shes
163+
assertAnalyzesTo(analyzer, "dishes", new String[] { "dish" });
164+
assertAnalyzesTo(analyzer, "washes", new String[] { "wash" });
165+
// ees
166+
assertAnalyzesTo(analyzer, "employees", new String[] { "employee" });
167+
assertAnalyzesTo(analyzer, "bees", new String[] { "bee" });
168+
// tch
169+
assertAnalyzesTo(analyzer, "watches", new String[] { "watch" });
170+
assertAnalyzesTo(analyzer, "itches", new String[] { "itch" });
171+
// ies->y but only for length >4
172+
assertAnalyzesTo(analyzer, "spies", new String[] { "spy" });
173+
assertAnalyzesTo(analyzer, "ties", new String[] { "tie" });
174+
assertAnalyzesTo(analyzer, "lies", new String[] { "lie" });
175+
assertAnalyzesTo(analyzer, "pies", new String[] { "pie" });
176+
assertAnalyzesTo(analyzer, "dies", new String[] { "die" });
177+
178+
assertAnalyzesTo(analyzer, "lunches", new String[] { "lunch" });
179+
assertAnalyzesTo(analyzer, "avalanches", new String[] { "avalanche" });
180+
assertAnalyzesTo(analyzer, "headaches", new String[] { "headache" });
181+
assertAnalyzesTo(analyzer, "caches", new String[] { "cache" });
182+
assertAnalyzesTo(analyzer, "beaches", new String[] { "beach" });
183+
assertAnalyzesTo(analyzer, "britches", new String[] { "britch" });
184+
assertAnalyzesTo(analyzer, "cockroaches", new String[] { "cockroach" });
185+
assertAnalyzesTo(analyzer, "cliches", new String[] { "cliche" });
186+
assertAnalyzesTo(analyzer, "quiches", new String[] { "quiche" });
187+
188+
}
189+
}
190+
114191
public void testMultipleLanguagesThrowsException() throws IOException {
115192
Version v = VersionUtils.randomVersion(random());
116193
Settings settings = Settings.builder()

0 commit comments

Comments
 (0)