Skip to content

Commit f47573d

Browse files
committed
Add efficient first-phase implementation for regexp queries
Following the description in https://swtch.com/~rsc/regexp/regexp4.html, along with the fact that Lucene gives us a regular expression parse tree, we *can* implement an efficient first-phase match on regular expressions without needing to write a lot of code. Signed-off-by: Michael Froh <froh@amazon.com>
1 parent a39ffaf commit f47573d

File tree

3 files changed

+137
-22
lines changed

3 files changed

+137
-22
lines changed

server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java

+77-12
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import org.apache.lucene.util.BytesRef;
3535
import org.apache.lucene.util.automaton.Automaton;
3636
import org.apache.lucene.util.automaton.CompiledAutomaton;
37+
import org.apache.lucene.util.automaton.RegExp;
3738
import org.opensearch.common.lucene.BytesRefs;
3839
import org.opensearch.common.lucene.Lucene;
3940
import org.opensearch.common.regex.Regex;
@@ -52,6 +53,7 @@
5253
import java.io.IOException;
5354
import java.io.StringReader;
5455
import java.io.UncheckedIOException;
56+
import java.util.ArrayList;
5557
import java.util.Arrays;
5658
import java.util.Collections;
5759
import java.util.HashSet;
@@ -62,7 +64,6 @@
6264
import java.util.Set;
6365
import java.util.function.Predicate;
6466
import java.util.function.Supplier;
65-
import java.util.regex.Pattern;
6667

6768
import static org.opensearch.index.mapper.KeywordFieldMapper.normalizeValue;
6869

@@ -500,17 +501,76 @@ public Query regexpQuery(
500501
MultiTermQuery.RewriteMethod method,
501502
QueryShardContext context
502503
) {
503-
// TODO -- Extracting mandatory characters from a regex is not trivial, since entire blocks may be optional.
504-
// It is functionally correct to approximate with MatchAllDocs, but performance won't be good.
505-
return new WildcardMatchingQuery(
506-
name(),
507-
new MatchAllDocsQuery(),
508-
Pattern.compile(value).asMatchPredicate(),
509-
"/" + value + "/",
510-
context,
511-
this
512-
);
513-
504+
RegExp regExp = new RegExp(value, syntaxFlags, matchFlags);
505+
Automaton automaton = regExp.toAutomaton(maxDeterminizedStates);
506+
CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton);
507+
508+
return new WildcardMatchingQuery(name(), regexpToQuery(name(), regExp), s -> {
509+
BytesRef valueBytes = BytesRefs.toBytesRef(s);
510+
return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length);
511+
}, "/" + value + "/", context, this);
512+
}
513+
514+
/**
515+
* Implement the match rules described in <a href="https://swtch.com/~rsc/regexp/regexp4.html">Regular Expression Matching with a Trigram Index</a>.
516+
*
517+
* @param fieldName name of the wildcard field
518+
* @param regExp a parsed node in the {@link RegExp} tree
519+
* @return a query that matches on the known required parts of the given regular expression
520+
*/
521+
private static Query regexpToQuery(String fieldName, RegExp regExp) {
522+
Query query;
523+
if (Objects.requireNonNull(regExp.kind) == RegExp.Kind.REGEXP_UNION) {
524+
List<Query> clauses = new ArrayList<>();
525+
while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) {
526+
clauses.add(regexpToQuery(fieldName, regExp.exp2));
527+
regExp = regExp.exp1;
528+
}
529+
clauses.add(regexpToQuery(fieldName, regExp.exp2));
530+
clauses.add(regexpToQuery(fieldName, regExp.exp1));
531+
BooleanQuery.Builder builder = new BooleanQuery.Builder();
532+
for (int i = clauses.size() - 1; i >= 0; i--) {
533+
Query clause = clauses.get(i);
534+
if (clause instanceof MatchAllDocsQuery) {
535+
return clause;
536+
}
537+
builder.add(clause, BooleanClause.Occur.SHOULD);
538+
}
539+
query = builder.build();
540+
} else if (regExp.kind == RegExp.Kind.REGEXP_STRING) {
541+
BooleanQuery.Builder builder = new BooleanQuery.Builder();
542+
for (String string : getRequiredNGrams("*" + regExp.s + "*")) {
543+
builder.add(new TermQuery(new Term(fieldName, string)), BooleanClause.Occur.FILTER);
544+
}
545+
query = builder.build();
546+
} else if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION) {
547+
List<Query> clauses = new ArrayList<>();
548+
while (regExp.exp1.kind == RegExp.Kind.REGEXP_CONCATENATION) {
549+
clauses.add(regexpToQuery(fieldName, regExp.exp2));
550+
regExp = regExp.exp1;
551+
}
552+
clauses.add(regexpToQuery(fieldName, regExp.exp2));
553+
clauses.add(regexpToQuery(fieldName, regExp.exp1));
554+
BooleanQuery.Builder builder = new BooleanQuery.Builder();
555+
for (int i = clauses.size() - 1; i >= 0; i--) {
556+
Query clause = clauses.get(i);
557+
if (!(clause instanceof MatchAllDocsQuery)) {
558+
builder.add(clause, BooleanClause.Occur.FILTER);
559+
}
560+
}
561+
query = builder.build();
562+
} else if (regExp.kind == RegExp.Kind.REGEXP_REPEAT_MIN || regExp.kind == RegExp.Kind.REGEXP_REPEAT_MINMAX) {
563+
return regexpToQuery(fieldName, regExp.exp1);
564+
} else {
565+
return new MatchAllDocsQuery();
566+
}
567+
if (query instanceof BooleanQuery) {
568+
BooleanQuery booleanQuery = (BooleanQuery) query;
569+
if (booleanQuery.clauses().size() == 1) {
570+
return booleanQuery.iterator().next().getQuery();
571+
}
572+
}
573+
return query;
514574
}
515575

516576
@Override
@@ -711,6 +771,11 @@ public boolean isCacheable(LeafReaderContext leafReaderContext) {
711771
}
712772
};
713773
}
774+
775+
/**
 * Exposes the predicate held in {@code secondPhaseMatcher} so that tests can exercise it
 * directly against sample strings.
 *
 * @return the second-phase matcher of this query
 */
// Visible for testing
776+
Predicate<String> getSecondPhaseMatcher() {
777+
return secondPhaseMatcher;
778+
}
714779
}
715780

716781
@Override

server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTest.java server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java

+5-5
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import java.util.ArrayList;
1818
import java.util.List;
1919

20-
public class WildcardFieldMapperTest extends MapperTestCase {
20+
public class WildcardFieldMapperTests extends MapperTestCase {
2121

2222
@Override
2323
protected void minimalMapping(XContentBuilder b) throws IOException {
@@ -49,8 +49,8 @@ public void testTokenizer() throws IOException {
4949
}
5050
assertEquals(
5151
List.of(
52-
WildcardFieldTypeTest.prefixAnchored("p"),
53-
WildcardFieldTypeTest.prefixAnchored("pi"),
52+
WildcardFieldTypeTests.prefixAnchored("p"),
53+
WildcardFieldTypeTests.prefixAnchored("pi"),
5454
"p",
5555
"pi",
5656
"pic",
@@ -65,9 +65,9 @@ public void testTokenizer() throws IOException {
6565
"kle",
6666
"l",
6767
"le",
68-
WildcardFieldTypeTest.suffixAnchored("le"),
68+
WildcardFieldTypeTests.suffixAnchored("le"),
6969
"e",
70-
WildcardFieldTypeTest.suffixAnchored("e")
70+
WildcardFieldTypeTests.suffixAnchored("e")
7171
),
7272
terms
7373
);

server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTest.java server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java

+55-5
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,13 @@
1111
import org.apache.lucene.index.Term;
1212
import org.apache.lucene.search.BooleanClause;
1313
import org.apache.lucene.search.BooleanQuery;
14+
import org.apache.lucene.search.Query;
1415
import org.apache.lucene.search.TermQuery;
1516

1617
import java.util.HashSet;
1718
import java.util.Set;
1819

19-
public class WildcardFieldTypeTest extends FieldTypeTestCase {
20+
public class WildcardFieldTypeTests extends FieldTypeTestCase {
2021

2122
static String prefixAnchored(String val) {
2223
return (char) 0 + val;
@@ -38,7 +39,13 @@ public void testTermQuery() {
3839
for (String term : expectedTerms) {
3940
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
4041
}
41-
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "apple"), ft.termQuery("apple", null));
42+
Query actual = ft.termQuery("apple", null);
43+
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "apple"), actual);
44+
WildcardFieldMapper.WildcardMatchingQuery actualTermQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
45+
assertTrue(actualTermQuery.getSecondPhaseMatcher().test("apple"));
46+
assertFalse(actualTermQuery.getSecondPhaseMatcher().test("Apple"));
47+
assertFalse(actualTermQuery.getSecondPhaseMatcher().test("flapple"));
48+
assertFalse(actualTermQuery.getSecondPhaseMatcher().test("apples"));
4249
}
4350

4451
public void testWildcardQuery() {
@@ -94,9 +101,52 @@ public void testMultipleWildcardsInQuery() {
94101
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
95102
}
96103

97-
assertEquals(
98-
new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), pattern),
99-
ft.wildcardQuery(pattern, null, null)
104+
Query actual = ft.wildcardQuery(pattern, null, null);
105+
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), pattern), actual);
106+
WildcardFieldMapper.WildcardMatchingQuery actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
107+
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdzzzefgqh"));
108+
assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("abcdzzzefgqqh"));
109+
}
110+
111+
// Checks both halves of a regexp query on a wildcard field: the approximation query built from
// the pattern's required parts, and the second-phase matcher that evaluates the full pattern.
public void testRegexpQuery() {
112+
String pattern = ".*apple.*";
113+
MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field");
114+
115+
// The literal "apple" is surrounded by ".*", so only its interior 3-grams are expected as terms.
Set<String> expectedTerms = new HashSet<>();
116+
expectedTerms.add("app");
117+
expectedTerms.add("ppl");
118+
expectedTerms.add("ple");
119+
BooleanQuery.Builder builder = new BooleanQuery.Builder();
120+
for (String term : expectedTerms) {
121+
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
122+
}
123+
124+
// NOTE(review): the three int arguments are presumably syntaxFlags, matchFlags and
// maxDeterminizedStates; confirm against the regexpQuery signature.
Query actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null);
125+
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual);
126+
WildcardFieldMapper.WildcardMatchingQuery actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
127+
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apple_foo"));
128+
assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apply_foo"));
129+
130+
// Concatenation of a literal and two unions: expected as FILTER(ab), FILTER(zz OR cd OR ef)
// and FILTER(hi OR jk). The ".*" after "ef" contributes no clause.
pattern = "ab(zz|cd|ef.*)(hi|jk)";
131+
builder = new BooleanQuery.Builder();
132+
builder.add(new TermQuery(new Term("field", "ab")), BooleanClause.Occur.FILTER);
133+
builder.add(
134+
new BooleanQuery.Builder().add(new TermQuery(new Term("field", "zz")), BooleanClause.Occur.SHOULD)
135+
.add(new TermQuery(new Term("field", "cd")), BooleanClause.Occur.SHOULD)
136+
.add(new TermQuery(new Term("field", "ef")), BooleanClause.Occur.SHOULD)
137+
.build(),
138+
BooleanClause.Occur.FILTER
139+
);
140+
builder.add(
141+
new BooleanQuery.Builder().add(new TermQuery(new Term("field", "hi")), BooleanClause.Occur.SHOULD)
142+
.add(new TermQuery(new Term("field", "jk")), BooleanClause.Occur.SHOULD)
143+
.build(),
144+
BooleanClause.Occur.FILTER
100145
);
146+
actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null);
147+
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual);
148+
actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
149+
// The second phase must accept full matches of the pattern, including text consumed by ".*".
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdjk"));
150+
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abefqwertyhi"));
101151
}
102152
}

0 commit comments

Comments
 (0)