Skip to content

Commit 302a3fd

Browse files
authored
Remove MinimizationOperations (#17268)
* Remove MinimizationOperations This class was removed from Lucene because it is no longer needed. It was copied into the OpenSearch code base because we were using it and we didn't want our code to break. In fact, the correct choice would have been to remove the references to `MinimizationOperations.minimize()` and either replace them with calls to `Operations.determinize()` or simply drop them altogether because the automaton is deterministic. Signed-off-by: Michael Froh <froh@amazon.com> * Add changelog entry Signed-off-by: Michael Froh <froh@amazon.com> --------- Signed-off-by: Michael Froh <froh@amazon.com>
1 parent e93791b commit 302a3fd

File tree

10 files changed

+22
-411
lines changed

10 files changed

+22
-411
lines changed

CHANGELOG-3.0.md

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
3434
- Refactor `:libs` module `bootstrap` package to eliminate top level split packages for JPMS support ([#17117](https://github.com/opensearch-project/OpenSearch/pull/17117))
3535
- Refactor the codebase to eliminate top level split packages for JPMS support ([#17153](https://github.com/opensearch-project/OpenSearch/pull/17153)
3636
- Refactor `:server` module `org.apacge.lucene` package to eliminate top level split packages for JPMS support ([#17241](https://github.com/opensearch-project/OpenSearch/pull/17241))
37+
- Stop minimizing automata used for case-insensitive matches ([#17268](https://github.com/opensearch-project/OpenSearch/pull/17268))
3738
- Refactor the `:server` module `org.opensearch.client` to `org.opensearch.transport.client` to eliminate top level split packages for JPMS support ([#17272](https://github.com/opensearch-project/OpenSearch/pull/17272))
3839

3940
### Deprecated

modules/reindex/src/main/java/org/opensearch/index/reindex/ReindexValidator.java

+1-2
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949
import org.opensearch.common.settings.Settings;
5050
import org.opensearch.core.common.Strings;
5151
import org.opensearch.index.IndexNotFoundException;
52-
import org.opensearch.lucene.util.automaton.MinimizationOperations;
5352
import org.opensearch.search.builder.SearchSourceBuilder;
5453

5554
import java.util.List;
@@ -114,7 +113,7 @@ static CharacterRunAutomaton buildRemoteAllowlist(List<String> allowlist) {
114113
return new CharacterRunAutomaton(Automata.makeEmpty());
115114
}
116115
Automaton automaton = Regex.simpleMatchToAutomaton(allowlist.toArray(Strings.EMPTY_ARRAY));
117-
automaton = MinimizationOperations.minimize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
116+
automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
118117
if (Operations.isTotal(automaton)) {
119118
throw new IllegalArgumentException(
120119
"Refusing to start because allowlist "

server/src/main/java/org/opensearch/common/lucene/search/AutomatonQueries.java

+12-18
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
import org.apache.lucene.util.automaton.Automata;
4040
import org.apache.lucene.util.automaton.Automaton;
4141
import org.apache.lucene.util.automaton.Operations;
42-
import org.opensearch.lucene.util.automaton.MinimizationOperations;
4342

4443
import java.util.ArrayList;
4544
import java.util.Iterator;
@@ -57,12 +56,12 @@ public static Automaton caseInsensitivePrefix(String s) {
5756
List<Automaton> list = new ArrayList<>();
5857
Iterator<Integer> iter = s.codePoints().iterator();
5958
while (iter.hasNext()) {
60-
list.add(toCaseInsensitiveChar(iter.next(), Integer.MAX_VALUE));
59+
list.add(toCaseInsensitiveChar(iter.next()));
6160
}
6261
list.add(Automata.makeAnyString());
6362

6463
Automaton a = Operations.concatenate(list);
65-
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
64+
assert a.isDeterministic();
6665
return a;
6766
}
6867

@@ -85,14 +84,14 @@ public static AutomatonQuery caseInsensitivePrefixQuery(Term prefix, MultiTermQu
8584
*/
8685
public static AutomatonQuery caseInsensitiveTermQuery(Term term) {
8786
BytesRef prefix = term.bytes();
88-
return new AutomatonQuery(term, toCaseInsensitiveString(prefix, Integer.MAX_VALUE));
87+
return new AutomatonQuery(term, toCaseInsensitiveString(prefix));
8988
}
9089

9190
/**
9291
* Build an automaton matching a wildcard pattern, ASCII case insensitive, if the method is null, then will use {@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE}.
9392
*/
9493
public static AutomatonQuery caseInsensitiveWildcardQuery(Term wildcardquery, MultiTermQuery.RewriteMethod method) {
95-
return createAutomatonQuery(wildcardquery, toCaseInsensitiveWildcardAutomaton(wildcardquery, Integer.MAX_VALUE), method);
94+
return createAutomatonQuery(wildcardquery, toCaseInsensitiveWildcardAutomaton(wildcardquery), method);
9695
}
9796

9897
/**
@@ -124,7 +123,7 @@ public static AutomatonQuery createAutomatonQuery(Term term, Automaton automaton
124123
* Convert Lucene wildcard syntax into an automaton.
125124
*/
126125
@SuppressWarnings("fallthrough")
127-
public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery, int maxDeterminizedStates) {
126+
public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery) {
128127
List<Automaton> automata = new ArrayList<>();
129128

130129
String wildcardText = wildcardquery.text();
@@ -148,32 +147,28 @@ public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery, i
148147
break;
149148
} // else fallthru, lenient parsing with a trailing \
150149
default:
151-
automata.add(toCaseInsensitiveChar(c, maxDeterminizedStates));
150+
automata.add(toCaseInsensitiveChar(c));
152151
}
153152
i += length;
154153
}
155154

156155
return Operations.concatenate(automata);
157156
}
158157

159-
protected static Automaton toCaseInsensitiveString(BytesRef br, int maxDeterminizedStates) {
160-
return toCaseInsensitiveString(br.utf8ToString(), maxDeterminizedStates);
158+
protected static Automaton toCaseInsensitiveString(BytesRef br) {
159+
return toCaseInsensitiveString(br.utf8ToString());
161160
}
162161

163-
public static Automaton toCaseInsensitiveString(String s, int maxDeterminizedStates) {
162+
public static Automaton toCaseInsensitiveString(String s) {
164163
List<Automaton> list = new ArrayList<>();
165164
Iterator<Integer> iter = s.codePoints().iterator();
166165
while (iter.hasNext()) {
167-
list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
166+
list.add(toCaseInsensitiveChar(iter.next()));
168167
}
169-
170-
Automaton a = Operations.concatenate(list);
171-
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
172-
return a;
173-
168+
return Operations.concatenate(list);
174169
}
175170

176-
public static Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
171+
public static Automaton toCaseInsensitiveChar(int codepoint) {
177172
Automaton case1 = Automata.makeChar(codepoint);
178173
// For now we only work with ASCII characters
179174
if (codepoint > 128) {
@@ -183,7 +178,6 @@ public static Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminized
183178
Automaton result;
184179
if (altCase != codepoint) {
185180
result = Operations.union(case1, Automata.makeChar(altCase));
186-
result = MinimizationOperations.minimize(result, maxDeterminizedStates);
187181
} else {
188182
result = case1;
189183
}

server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ public Query termQueryCaseInsensitive(Object value, QueryShardContext context) {
411411
Term term = new Term(name(), bytesRef);
412412
Query query = AutomatonQueries.createAutomatonQuery(
413413
term,
414-
AutomatonQueries.toCaseInsensitiveString(bytesRef.utf8ToString(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT),
414+
AutomatonQueries.toCaseInsensitiveString(bytesRef.utf8ToString()),
415415
MultiTermQuery.DOC_VALUES_REWRITE
416416
);
417417
if (boost() != 1f) {

server/src/main/java/org/opensearch/index/mapper/TextFieldMapper.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,7 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, bool
640640
}
641641
List<Automaton> automata = new ArrayList<>();
642642
if (caseInsensitive) {
643-
automata.add(AutomatonQueries.toCaseInsensitiveString(value, Integer.MAX_VALUE));
643+
automata.add(AutomatonQueries.toCaseInsensitiveString(value));
644644
} else {
645645
automata.add(Automata.makeString(value));
646646
}

server/src/main/java/org/opensearch/indices/SystemIndices.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
import org.opensearch.common.Nullable;
4242
import org.opensearch.common.regex.Regex;
4343
import org.opensearch.core.index.Index;
44-
import org.opensearch.lucene.util.automaton.MinimizationOperations;
4544

4645
import java.util.Collection;
4746
import java.util.List;
@@ -147,6 +146,8 @@ private static CharacterRunAutomaton buildCharacterRunAutomaton(Collection<Syste
147146
Optional<Automaton> automaton = descriptors.stream()
148147
.map(descriptor -> Regex.simpleMatchToAutomaton(descriptor.getIndexPattern()))
149148
.reduce(Operations::union);
150-
return new CharacterRunAutomaton(MinimizationOperations.minimize(automaton.orElse(Automata.makeEmpty()), Integer.MAX_VALUE));
149+
return new CharacterRunAutomaton(
150+
Operations.determinize(automaton.orElse(Automata.makeEmpty()), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)
151+
);
151152
}
152153
}

0 commit comments

Comments
 (0)