Skip to content

Commit 320611a

Browse files
HUSTERGS, gesong.samuel, and msfroh
authored
fix escaped wildcard query on wildcard field (opensearch-project#15737)
* fix escaped wildcard query on wildcard field
  Signed-off-by: gesong.samuel <gesong.samuel@bytedance.com>
* fix format error
  Signed-off-by: gesong.samuel <gesong.samuel@bytedance.com>
* add change log
  Signed-off-by: gesong.samuel <gesong.samuel@bytedance.com>
---------
Signed-off-by: gesong.samuel <gesong.samuel@bytedance.com>
Signed-off-by: Michael Froh <froh@amazon.com>
Co-authored-by: gesong.samuel <gesong.samuel@bytedance.com>
Co-authored-by: Michael Froh <froh@amazon.com>
1 parent f8515c7 commit 320611a

File tree

3 files changed

+68
-13
lines changed

3 files changed

+68
-13
lines changed

CHANGELOG.md

+1
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
2626
### Removed
2727

2828
### Fixed
29+
- Fix wildcard query containing escaped character ([#15737](https://github.com/opensearch-project/OpenSearch/pull/15737))
2930

3031
### Security
3132

server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java

+41-13
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,6 @@
4040
import org.apache.lucene.util.automaton.RegExp;
4141
import org.opensearch.common.lucene.BytesRefs;
4242
import org.opensearch.common.lucene.Lucene;
43-
import org.opensearch.common.regex.Regex;
4443
import org.opensearch.common.unit.Fuzziness;
4544
import org.opensearch.core.xcontent.XContentParser;
4645
import org.opensearch.index.analysis.IndexAnalyzers;
@@ -430,22 +429,27 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo
430429
finalValue = value;
431430
}
432431
Predicate<String> matchPredicate;
433-
if (value.contains("?")) {
434-
Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), finalValue));
435-
CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton);
432+
Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), finalValue));
433+
CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton);
434+
if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.SINGLE) {
435+
// when type equals SINGLE, #compiledAutomaton.runAutomaton is null
436436
matchPredicate = s -> {
437437
if (caseInsensitive) {
438438
s = s.toLowerCase(Locale.ROOT);
439439
}
440-
BytesRef valueBytes = BytesRefs.toBytesRef(s);
441-
return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length);
440+
return s.equals(finalValue);
442441
};
442+
} else if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.ALL) {
443+
return existsQuery(context);
444+
} else if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.NONE) {
445+
return new MatchNoDocsQuery("Wildcard expression matches nothing");
443446
} else {
444447
matchPredicate = s -> {
445448
if (caseInsensitive) {
446449
s = s.toLowerCase(Locale.ROOT);
447450
}
448-
return Regex.simpleMatch(finalValue, s);
451+
BytesRef valueBytes = BytesRefs.toBytesRef(s);
452+
return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length);
449453
};
450454
}
451455

@@ -468,22 +472,30 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo
468472
// Package-private for testing
469473
static Set<String> getRequiredNGrams(String value) {
470474
Set<String> terms = new HashSet<>();
475+
476+
if (value.isEmpty()) {
477+
return terms;
478+
}
479+
471480
int pos = 0;
481+
String rawSequence = null;
472482
String currentSequence = null;
473483
if (!value.startsWith("?") && !value.startsWith("*")) {
474484
// Can add prefix term
475-
currentSequence = getNonWildcardSequence(value, 0);
485+
rawSequence = getNonWildcardSequence(value, 0);
486+
currentSequence = performEscape(rawSequence);
476487
if (currentSequence.length() == 1) {
477488
terms.add(new String(new char[] { 0, currentSequence.charAt(0) }));
478489
} else {
479490
terms.add(new String(new char[] { 0, currentSequence.charAt(0), currentSequence.charAt(1) }));
480491
}
481492
} else {
482493
pos = findNonWildcardSequence(value, pos);
483-
currentSequence = getNonWildcardSequence(value, pos);
494+
rawSequence = getNonWildcardSequence(value, pos);
484495
}
485496
while (pos < value.length()) {
486-
boolean isEndOfValue = pos + currentSequence.length() == value.length();
497+
boolean isEndOfValue = pos + rawSequence.length() == value.length();
498+
currentSequence = performEscape(rawSequence);
487499
if (!currentSequence.isEmpty() && currentSequence.length() < 3 && !isEndOfValue && pos > 0) {
488500
// If this is a prefix or suffix of length < 3, then we already have a longer token including the anchor.
489501
terms.add(currentSequence);
@@ -502,16 +514,16 @@ static Set<String> getRequiredNGrams(String value) {
502514
terms.add(new String(new char[] { a, b, 0 }));
503515
}
504516
}
505-
pos = findNonWildcardSequence(value, pos + currentSequence.length());
506-
currentSequence = getNonWildcardSequence(value, pos);
517+
pos = findNonWildcardSequence(value, pos + rawSequence.length());
518+
rawSequence = getNonWildcardSequence(value, pos);
507519
}
508520
return terms;
509521
}
510522

511523
private static String getNonWildcardSequence(String value, int startFrom) {
512524
for (int i = startFrom; i < value.length(); i++) {
513525
char c = value.charAt(i);
514-
if (c == '?' || c == '*') {
526+
if ((c == '?' || c == '*') && (i == 0 || value.charAt(i - 1) != '\\')) {
515527
return value.substring(startFrom, i);
516528
}
517529
}
@@ -529,6 +541,22 @@ private static int findNonWildcardSequence(String value, int startFrom) {
529541
return value.length();
530542
}
531543

544+
private static String performEscape(String str) {
545+
StringBuilder sb = new StringBuilder();
546+
for (int i = 0; i < str.length(); i++) {
547+
if (str.charAt(i) == '\\' && (i + 1) < str.length()) {
548+
char c = str.charAt(i + 1);
549+
if (c == '*' || c == '?') {
550+
i++;
551+
}
552+
}
553+
sb.append(str.charAt(i));
554+
}
555+
assert !sb.toString().contains("\\*");
556+
assert !sb.toString().contains("\\?");
557+
return sb.toString();
558+
}
559+
532560
@Override
533561
public Query regexpQuery(
534562
String value,

server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java

+26
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,32 @@ public void testWildcardQuery() {
8888
);
8989
}
9090

91+
public void testEscapedWildcardQuery() {
92+
MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field");
93+
Set<String> expectedTerms = new HashSet<>();
94+
expectedTerms.add(prefixAnchored("*"));
95+
expectedTerms.add(suffixAnchored("*"));
96+
97+
BooleanQuery.Builder builder = new BooleanQuery.Builder();
98+
for (String term : expectedTerms) {
99+
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
100+
}
101+
102+
assertEquals(
103+
new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\**\\*"),
104+
ft.wildcardQuery("\\**\\*", null, null)
105+
);
106+
107+
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\*"), ft.wildcardQuery("\\*", null, null));
108+
109+
expectedTerms.remove(suffixAnchored("*"));
110+
builder = new BooleanQuery.Builder();
111+
for (String term : expectedTerms) {
112+
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
113+
}
114+
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\**"), ft.wildcardQuery("\\**", null, null));
115+
}
116+
91117
public void testMultipleWildcardsInQuery() {
92118
final String pattern = "a?cd*efg?h";
93119
MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field");

0 commit comments

Comments
 (0)