|
34 | 34 | import org.apache.lucene.util.BytesRef;
|
35 | 35 | import org.apache.lucene.util.automaton.Automaton;
|
36 | 36 | import org.apache.lucene.util.automaton.CompiledAutomaton;
|
| 37 | +import org.apache.lucene.util.automaton.RegExp; |
37 | 38 | import org.opensearch.common.lucene.BytesRefs;
|
38 | 39 | import org.opensearch.common.lucene.Lucene;
|
39 | 40 | import org.opensearch.common.regex.Regex;
|
|
52 | 53 | import java.io.IOException;
|
53 | 54 | import java.io.StringReader;
|
54 | 55 | import java.io.UncheckedIOException;
|
| 56 | +import java.util.ArrayList; |
55 | 57 | import java.util.Arrays;
|
56 | 58 | import java.util.Collections;
|
57 | 59 | import java.util.HashSet;
|
|
62 | 64 | import java.util.Set;
|
63 | 65 | import java.util.function.Predicate;
|
64 | 66 | import java.util.function.Supplier;
|
65 |
| -import java.util.regex.Pattern; |
66 | 67 |
|
67 | 68 | import static org.opensearch.index.mapper.KeywordFieldMapper.normalizeValue;
|
68 | 69 |
|
@@ -500,17 +501,76 @@ public Query regexpQuery(
|
500 | 501 | MultiTermQuery.RewriteMethod method,
|
501 | 502 | QueryShardContext context
|
502 | 503 | ) {
|
503 |
| - // TODO -- Extracting mandatory characters from a regex is not trivial, since entire blocks may be optional. |
504 |
| - // It is functionally correct to approximate with MatchAllDocs, but performance won't be good. |
505 |
| - return new WildcardMatchingQuery( |
506 |
| - name(), |
507 |
| - new MatchAllDocsQuery(), |
508 |
| - Pattern.compile(value).asMatchPredicate(), |
509 |
| - "/" + value + "/", |
510 |
| - context, |
511 |
| - this |
512 |
| - ); |
513 |
| - |
| 504 | + RegExp regExp = new RegExp(value, syntaxFlags, matchFlags); |
| 505 | + Automaton automaton = regExp.toAutomaton(maxDeterminizedStates); |
| 506 | + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton); |
| 507 | + |
| 508 | + return new WildcardMatchingQuery(name(), regexpToQuery(name(), regExp), s -> { |
| 509 | + BytesRef valueBytes = BytesRefs.toBytesRef(s); |
| 510 | + return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length); |
| 511 | + }, "/" + value + "/", context, this); |
| 512 | + } |
| 513 | + |
| 514 | + /** |
| 515 | + * Implement the match rules described in <a href="https://swtch.com/~rsc/regexp/regexp4.html">Regular Expression Matching with a Trigram Index</a>. |
| 516 | + * |
| 517 | + * @param fieldName name of the wildcard field |
| 518 | + * @param regExp a parsed node in the {@link RegExp} tree |
| 519 | + * @return a query that matches on the known required parts of the given regular expression |
| 520 | + */ |
| 521 | + private static Query regexpToQuery(String fieldName, RegExp regExp) { |
| 522 | + Query query; |
| 523 | + if (Objects.requireNonNull(regExp.kind) == RegExp.Kind.REGEXP_UNION) { |
| 524 | + List<Query> clauses = new ArrayList<>(); |
| 525 | + while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) { |
| 526 | + clauses.add(regexpToQuery(fieldName, regExp.exp2)); |
| 527 | + regExp = regExp.exp1; |
| 528 | + } |
| 529 | + clauses.add(regexpToQuery(fieldName, regExp.exp2)); |
| 530 | + clauses.add(regexpToQuery(fieldName, regExp.exp1)); |
| 531 | + BooleanQuery.Builder builder = new BooleanQuery.Builder(); |
| 532 | + for (int i = clauses.size() - 1; i >= 0; i--) { |
| 533 | + Query clause = clauses.get(i); |
| 534 | + if (clause instanceof MatchAllDocsQuery) { |
| 535 | + return clause; |
| 536 | + } |
| 537 | + builder.add(clause, BooleanClause.Occur.SHOULD); |
| 538 | + } |
| 539 | + query = builder.build(); |
| 540 | + } else if (regExp.kind == RegExp.Kind.REGEXP_STRING) { |
| 541 | + BooleanQuery.Builder builder = new BooleanQuery.Builder(); |
| 542 | + for (String string : getRequiredNGrams("*" + regExp.s + "*")) { |
| 543 | + builder.add(new TermQuery(new Term(fieldName, string)), BooleanClause.Occur.FILTER); |
| 544 | + } |
| 545 | + query = builder.build(); |
| 546 | + } else if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION) { |
| 547 | + List<Query> clauses = new ArrayList<>(); |
| 548 | + while (regExp.exp1.kind == RegExp.Kind.REGEXP_CONCATENATION) { |
| 549 | + clauses.add(regexpToQuery(fieldName, regExp.exp2)); |
| 550 | + regExp = regExp.exp1; |
| 551 | + } |
| 552 | + clauses.add(regexpToQuery(fieldName, regExp.exp2)); |
| 553 | + clauses.add(regexpToQuery(fieldName, regExp.exp1)); |
| 554 | + BooleanQuery.Builder builder = new BooleanQuery.Builder(); |
| 555 | + for (int i = clauses.size() - 1; i >= 0; i--) { |
| 556 | + Query clause = clauses.get(i); |
| 557 | + if (!(clause instanceof MatchAllDocsQuery)) { |
| 558 | + builder.add(clause, BooleanClause.Occur.FILTER); |
| 559 | + } |
| 560 | + } |
| 561 | + query = builder.build(); |
| 562 | + } else if (regExp.kind == RegExp.Kind.REGEXP_REPEAT_MIN || regExp.kind == RegExp.Kind.REGEXP_REPEAT_MINMAX) { |
| 563 | + return regexpToQuery(fieldName, regExp.exp1); |
| 564 | + } else { |
| 565 | + return new MatchAllDocsQuery(); |
| 566 | + } |
| 567 | + if (query instanceof BooleanQuery) { |
| 568 | + BooleanQuery booleanQuery = (BooleanQuery) query; |
| 569 | + if (booleanQuery.clauses().size() == 1) { |
| 570 | + return booleanQuery.iterator().next().getQuery(); |
| 571 | + } |
| 572 | + } |
| 573 | + return query; |
514 | 574 | }
|
515 | 575 |
|
516 | 576 | @Override
|
@@ -711,6 +771,11 @@ public boolean isCacheable(LeafReaderContext leafReaderContext) {
|
711 | 771 | }
|
712 | 772 | };
|
713 | 773 | }
|
| 774 | + |
| 775 | + // Visible for testing |
| 776 | + Predicate<String> getSecondPhaseMatcher() { |
| 777 | + return secondPhaseMatcher; |
| 778 | + } |
714 | 779 | }
|
715 | 780 |
|
716 | 781 | @Override
|
|
0 commit comments