Commit 3403028

Quickly compute terms aggregations when the top-level query is functionally match-all for a segment (#11643)
Signed-off-by: Sandesh Kumar <sandeshkr419@gmail.com>
(cherry picked from commit 7dac98c)
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 1235756 commit 3403028

8 files changed (+431 -82)

CHANGELOG.md (+1)

```diff
@@ -40,6 +40,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ### Changed
 - Allow composite aggregation to run under a parent filter aggregation ([#11499](https://github.com/opensearch-project/OpenSearch/pull/11499))
+- Quickly compute terms aggregations when the top-level query is functionally match-all for a segment ([#11643](https://github.com/opensearch-project/OpenSearch/pull/11643))
 
 ### Deprecated
 
```

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java (+111 -2)

```diff
@@ -35,8 +35,13 @@
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.PriorityQueue;
@@ -46,6 +51,7 @@
 import org.opensearch.common.util.LongHash;
 import org.opensearch.core.common.io.stream.StreamOutput;
 import org.opensearch.core.xcontent.XContentBuilder;
+import org.opensearch.index.mapper.DocCountFieldMapper;
 import org.opensearch.search.DocValueFormat;
 import org.opensearch.search.aggregations.AggregationExecutionException;
 import org.opensearch.search.aggregations.Aggregator;
@@ -73,6 +79,7 @@
 
 import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder;
 import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 
 /**
  * An aggregator of string values that relies on global ordinals in order to build buckets.
@@ -85,6 +92,8 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
 
     private final LongPredicate acceptedGlobalOrdinals;
     private final long valueCount;
+    private final String fieldName;
+    private Weight weight;
     private final GlobalOrdLookupFunction lookupGlobalOrd;
     protected final CollectionStrategy collectionStrategy;
     protected int segmentsWithSingleValuedOrds = 0;
@@ -136,16 +145,105 @@ public GlobalOrdinalsStringTermsAggregator(
                 return new DenseGlobalOrds();
             });
         }
+        this.fieldName = (valuesSource instanceof ValuesSource.Bytes.WithOrdinals.FieldData)
+            ? ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName()
+            : null;
     }
 
     String descriptCollectionStrategy() {
         return collectionStrategy.describe();
     }
 
+    public void setWeight(Weight weight) {
+        this.weight = weight;
+    }
+
+    /**
+     Read doc frequencies directly from indexed terms in the segment to skip iterating through individual documents
+     @param ctx The LeafReaderContext to collect terms from
+     @param globalOrds The SortedSetDocValues for the field's ordinals
+     @param ordCountConsumer A consumer to accept collected term frequencies
+     @return A LeafBucketCollector implementation with collection termination, since collection is complete
+     @throws IOException If an I/O error occurs during reading
+     */
+    LeafBucketCollector termDocFreqCollector(
+        LeafReaderContext ctx,
+        SortedSetDocValues globalOrds,
+        BiConsumer<Long, Integer> ordCountConsumer
+    ) throws IOException {
+        if (weight == null) {
+            // Weight not assigned - cannot use this optimization
+            return null;
+        } else {
+            if (weight.count(ctx) == 0) {
+                // No documents match the top-level query on this segment; we can skip the segment entirely
+                return LeafBucketCollector.NO_OP_COLLECTOR;
+            } else if (weight.count(ctx) != ctx.reader().maxDoc()) {
+                // weight.count(ctx) == ctx.reader().maxDoc() implies there are no deleted documents and
+                // the top-level query matches all docs in the segment
+                return null;
+            }
+        }
+
+        Terms segmentTerms = ctx.reader().terms(this.fieldName);
+        if (segmentTerms == null) {
+            // Field is not indexed.
+            return null;
+        }
+
+        NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
+        if (docCountValues.nextDoc() != NO_MORE_DOCS) {
+            // This segment has at least one document with the _doc_count field.
+            return null;
+        }
+
+        TermsEnum indexTermsEnum = segmentTerms.iterator();
+        BytesRef indexTerm = indexTermsEnum.next();
+        TermsEnum globalOrdinalTermsEnum = globalOrds.termsEnum();
+        BytesRef ordinalTerm = globalOrdinalTermsEnum.next();
+
+        // Iterate over the terms in the segment, look for matches in the global ordinal terms,
+        // and increment bucket count when segment terms match global ordinal terms.
+        while (indexTerm != null && ordinalTerm != null) {
+            int compare = indexTerm.compareTo(ordinalTerm);
+            if (compare == 0) {
+                if (acceptedGlobalOrdinals.test(globalOrdinalTermsEnum.ord())) {
+                    ordCountConsumer.accept(globalOrdinalTermsEnum.ord(), indexTermsEnum.docFreq());
+                }
+                indexTerm = indexTermsEnum.next();
+                ordinalTerm = globalOrdinalTermsEnum.next();
+            } else if (compare < 0) {
+                indexTerm = indexTermsEnum.next();
+            } else {
+                ordinalTerm = globalOrdinalTermsEnum.next();
+            }
+        }
+        return new LeafBucketCollector() {
+            @Override
+            public void collect(int doc, long owningBucketOrd) throws IOException {
+                throw new CollectionTerminatedException();
+            }
+        };
+    }
+
     @Override
     public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
         SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx);
         collectionStrategy.globalOrdsReady(globalOrds);
+
+        if (collectionStrategy instanceof DenseGlobalOrds
+            && this.resultStrategy instanceof StandardTermsResults
+            && sub == LeafBucketCollector.NO_OP_COLLECTOR) {
+            LeafBucketCollector termDocFreqCollector = termDocFreqCollector(
+                ctx,
+                globalOrds,
+                (ord, docCount) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, ord), docCount)
+            );
+            if (termDocFreqCollector != null) {
+                return termDocFreqCollector;
+            }
+        }
+
         SortedDocValues singleValues = DocValues.unwrapSingleton(globalOrds);
         if (singleValues != null) {
             segmentsWithSingleValuedOrds++;
@@ -343,9 +441,20 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol
             final SortedSetDocValues segmentOrds = valuesSource.ordinalsValues(ctx);
             segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount());
             assert sub == LeafBucketCollector.NO_OP_COLLECTOR;
-            final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds);
             mapping = valuesSource.globalOrdinalsMapping(ctx);
-            // Dense mode doesn't support include/exclude so we don't have to check it here.
+
+            if (this.resultStrategy instanceof StandardTermsResults) {
+                LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector(
+                    ctx,
+                    segmentOrds,
+                    (ord, docCount) -> incrementBucketDocCount(mapping.applyAsLong(ord), docCount)
+                );
+                if (termDocFreqCollector != null) {
+                    return termDocFreqCollector;
+                }
+            }
+
+            final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds);
            if (singleValues != null) {
                 segmentsWithSingleValuedOrds++;
                 return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, segmentOrds) {
```
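The core of the change is the precondition check plus the sorted merge above: when a segment is effectively match-all, each term's bucket count is exactly its document frequency in the segment's terms dictionary, so per-document collection can be skipped. A minimal standalone sketch of that underlying Lucene idea (illustrative code, not part of this commit; the class and field names are made up):

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class TermDocFreqSketch {
    public static void main(String[] args) throws Exception {
        try (Directory dir = new ByteBuffersDirectory()) {
            try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
                for (String value : new String[] { "a", "b", "b", "c", "c", "c" }) {
                    Document doc = new Document();
                    doc.add(new StringField("keyword", value, Field.Store.NO));
                    writer.addDocument(doc);
                }
            }
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                for (LeafReaderContext leaf : reader.leaves()) {
                    Terms terms = leaf.reader().terms("keyword");
                    if (terms == null) {
                        continue; // field not indexed in this segment
                    }
                    // Walk the sorted terms dictionary once; docFreq() gives the
                    // per-term count without touching any individual document.
                    TermsEnum termsEnum = terms.iterator();
                    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
                        System.out.println(term.utf8ToString() + " -> " + termsEnum.docFreq());
                    }
                }
            }
        }
    }
}
```

Note that `docFreq()` still counts deleted documents and knows nothing about `_doc_count`, which is why `termDocFreqCollector` bails out unless `weight.count(ctx) == ctx.reader().maxDoc()` (no deletes, query matches everything) and the segment contains no `_doc_count` field.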

server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java (+4)

```diff
@@ -244,6 +244,10 @@ public FieldData(IndexOrdinalsFieldData indexFieldData) {
                 this.indexFieldData = indexFieldData;
             }
 
+            public String getIndexFieldName() {
+                return this.indexFieldData.getFieldName();
+            }
+
             @Override
             public SortedBinaryDocValues bytesValues(LeafReaderContext context) {
                 final LeafOrdinalsFieldData atomicFieldData = indexFieldData.load(context);
```

server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java (+5)

```diff
@@ -387,6 +387,11 @@ public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
                         return null;
                     }
                 }
+
+                @Override
+                public int count(LeafReaderContext context) throws IOException {
+                    return weight.count(context);
+                }
             };
         } else {
             return weight;
```
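This override matters because the aggregator's fast path hinges on Lucene's `Weight#count(LeafReaderContext)`, which returns an exact per-segment match count when it can be computed without visiting documents, and -1 otherwise; without the delegation, ContextIndexSearcher's wrapping weight would fall back to the -1 default and the optimization could never trigger. A hedged sketch of how such a precondition reads (the helper class and method names are assumptions, not from the commit):

```java
import java.io.IOException;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

final class SegmentMatchAll {
    /**
     * True when the query is known to match every document in the segment.
     * Weight#count returns -1 when the count cannot be computed cheaply, which
     * safely fails this check; maxDoc() includes deleted documents, so equality
     * also implies the segment has no deletes.
     */
    static boolean isFunctionallyMatchAll(IndexSearcher searcher, Query query, LeafReaderContext leaf) throws IOException {
        Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1f);
        return weight.count(leaf) == leaf.reader().maxDoc();
    }
}
```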

server/src/test/java/org/opensearch/search/aggregations/bucket/terms/KeywordTermsAggregatorTests.java (+44 -32)

```diff
@@ -32,7 +32,6 @@
 package org.opensearch.search.aggregations.bucket.terms;
 
 import org.apache.lucene.document.Document;
-import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.IndexSearcher;
@@ -41,7 +40,7 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.index.RandomIndexWriter;
-import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.TriConsumer;
 import org.opensearch.index.mapper.KeywordFieldMapper;
 import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.search.aggregations.AggregatorTestCase;
@@ -57,6 +56,8 @@
 public class KeywordTermsAggregatorTests extends AggregatorTestCase {
     private static final String KEYWORD_FIELD = "keyword";
 
+    private static final Consumer<TermsAggregationBuilder> CONFIGURE_KEYWORD_FIELD = agg -> agg.field(KEYWORD_FIELD);
+
     private static final List<String> dataset;
     static {
         List<String> d = new ArrayList<>(45);
@@ -68,51 +69,63 @@ public class KeywordTermsAggregatorTests extends AggregatorTestCase {
         dataset = d;
     }
 
+    private static final Consumer<InternalMappedTerms> VERIFY_MATCH_ALL_DOCS = agg -> {
+        assertEquals(9, agg.getBuckets().size());
+        for (int i = 0; i < 9; i++) {
+            StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(i);
+            assertThat(bucket.getKey(), equalTo(String.valueOf(9L - i)));
+            assertThat(bucket.getDocCount(), equalTo(9L - i));
+        }
+    };
+
+    private static final Consumer<InternalMappedTerms> VERIFY_MATCH_NO_DOCS = agg -> { assertEquals(0, agg.getBuckets().size()); };
+
+    private static final Query MATCH_ALL_DOCS_QUERY = new MatchAllDocsQuery();
+
+    private static final Query MATCH_NO_DOCS_QUERY = new MatchNoDocsQuery();
+
     public void testMatchNoDocs() throws IOException {
         testSearchCase(
-            new MatchNoDocsQuery(),
+            ADD_SORTED_SET_FIELD_NOT_INDEXED,
+            MATCH_NO_DOCS_QUERY,
             dataset,
-            aggregation -> aggregation.field(KEYWORD_FIELD),
-            agg -> assertEquals(0, agg.getBuckets().size()),
-            null // without type hint
+            CONFIGURE_KEYWORD_FIELD,
+            VERIFY_MATCH_NO_DOCS,
+            null // without type hint
         );
 
         testSearchCase(
-            new MatchNoDocsQuery(),
+            ADD_SORTED_SET_FIELD_NOT_INDEXED,
+            MATCH_NO_DOCS_QUERY,
             dataset,
-            aggregation -> aggregation.field(KEYWORD_FIELD),
-            agg -> assertEquals(0, agg.getBuckets().size()),
-            ValueType.STRING // with type hint
+            CONFIGURE_KEYWORD_FIELD,
+            VERIFY_MATCH_NO_DOCS,
+            ValueType.STRING // with type hint
         );
     }
 
     public void testMatchAllDocs() throws IOException {
-        Query query = new MatchAllDocsQuery();
-
-        testSearchCase(query, dataset, aggregation -> aggregation.field(KEYWORD_FIELD), agg -> {
-            assertEquals(9, agg.getBuckets().size());
-            for (int i = 0; i < 9; i++) {
-                StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(i);
-                assertThat(bucket.getKey(), equalTo(String.valueOf(9L - i)));
-                assertThat(bucket.getDocCount(), equalTo(9L - i));
-            }
-        },
-            null // without type hint
+        testSearchCase(
+            ADD_SORTED_SET_FIELD_NOT_INDEXED,
+            MATCH_ALL_DOCS_QUERY,
+            dataset,
+            CONFIGURE_KEYWORD_FIELD,
+            VERIFY_MATCH_ALL_DOCS,
+            null // without type hint
         );
 
-        testSearchCase(query, dataset, aggregation -> aggregation.field(KEYWORD_FIELD), agg -> {
-            assertEquals(9, agg.getBuckets().size());
-            for (int i = 0; i < 9; i++) {
-                StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(i);
-                assertThat(bucket.getKey(), equalTo(String.valueOf(9L - i)));
-                assertThat(bucket.getDocCount(), equalTo(9L - i));
-            }
-        },
-            ValueType.STRING // with type hint
+        testSearchCase(
+            ADD_SORTED_SET_FIELD_NOT_INDEXED,
+            MATCH_ALL_DOCS_QUERY,
+            dataset,
+            CONFIGURE_KEYWORD_FIELD,
+            VERIFY_MATCH_ALL_DOCS,
+            ValueType.STRING // with type hint
         );
     }
 
     private void testSearchCase(
+        TriConsumer<Document, String, String> addField,
         Query query,
         List<String> dataset,
         Consumer<TermsAggregationBuilder> configure,
@@ -123,7 +136,7 @@ private void testSearchCase(
         try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
             Document document = new Document();
             for (String value : dataset) {
-                document.add(new SortedSetDocValuesField(KEYWORD_FIELD, new BytesRef(value)));
+                addField.apply(document, KEYWORD_FIELD, value);
                 indexWriter.addDocument(document);
                 document.clear();
             }
@@ -147,5 +160,4 @@
             }
         }
     }
-
 }
```
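The helper `ADD_SORTED_SET_FIELD_NOT_INDEXED` used throughout comes from `AggregatorTestCase` and its definition is not part of this diff; judging from the `SortedSetDocValuesField` line it replaces, it presumably looks something like the following (an assumption, not the commit's code). Indexing the value as doc values only, without postings, means the new terms-dictionary fast path cannot apply, so these tests keep exercising the ordinary collection path:

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.TriConsumer;

public abstract class AggregatorTestCaseSketch {
    // Presumed shape of the helper in AggregatorTestCase (assumption; not shown
    // in this diff): adds the value as a doc-values-only field, no postings.
    protected static final TriConsumer<Document, String, String> ADD_SORTED_SET_FIELD_NOT_INDEXED =
        (document, field, value) -> document.add(new SortedSetDocValuesField(field, new BytesRef(value)));
}
```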
