35
35
import org .apache .lucene .index .DocValues ;
36
36
import org .apache .lucene .index .IndexReader ;
37
37
import org .apache .lucene .index .LeafReaderContext ;
38
+ import org .apache .lucene .index .NumericDocValues ;
38
39
import org .apache .lucene .index .SortedDocValues ;
39
40
import org .apache .lucene .index .SortedSetDocValues ;
41
+ import org .apache .lucene .index .Terms ;
42
+ import org .apache .lucene .index .TermsEnum ;
43
+ import org .apache .lucene .search .CollectionTerminatedException ;
44
+ import org .apache .lucene .search .Weight ;
40
45
import org .apache .lucene .util .ArrayUtil ;
41
46
import org .apache .lucene .util .BytesRef ;
42
47
import org .apache .lucene .util .PriorityQueue ;
46
51
import org .opensearch .common .util .LongHash ;
47
52
import org .opensearch .core .common .io .stream .StreamOutput ;
48
53
import org .opensearch .core .xcontent .XContentBuilder ;
54
+ import org .opensearch .index .mapper .DocCountFieldMapper ;
49
55
import org .opensearch .search .DocValueFormat ;
50
56
import org .opensearch .search .aggregations .AggregationExecutionException ;
51
57
import org .opensearch .search .aggregations .Aggregator ;
73
79
74
80
import static org .opensearch .search .aggregations .InternalOrder .isKeyOrder ;
75
81
import static org .apache .lucene .index .SortedSetDocValues .NO_MORE_ORDS ;
82
+ import static org .apache .lucene .search .DocIdSetIterator .NO_MORE_DOCS ;
76
83
77
84
/**
78
85
* An aggregator of string values that relies on global ordinals in order to build buckets.
@@ -85,6 +92,8 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
85
92
86
93
private final LongPredicate acceptedGlobalOrdinals ;
87
94
private final long valueCount ;
95
+ private final String fieldName ;
96
+ private Weight weight ;
88
97
private final GlobalOrdLookupFunction lookupGlobalOrd ;
89
98
protected final CollectionStrategy collectionStrategy ;
90
99
protected int segmentsWithSingleValuedOrds = 0 ;
@@ -136,16 +145,105 @@ public GlobalOrdinalsStringTermsAggregator(
136
145
return new DenseGlobalOrds ();
137
146
});
138
147
}
148
+ this .fieldName = (valuesSource instanceof ValuesSource .Bytes .WithOrdinals .FieldData )
149
+ ? ((ValuesSource .Bytes .WithOrdinals .FieldData ) valuesSource ).getIndexFieldName ()
150
+ : null ;
139
151
}
140
152
141
153
String descriptCollectionStrategy () {
142
154
return collectionStrategy .describe ();
143
155
}
144
156
157
+ public void setWeight (Weight weight ) {
158
+ this .weight = weight ;
159
+ }
160
+
161
+ /**
162
+ Read doc frequencies directly from indexed terms in the segment to skip iterating through individual documents
163
+ @param ctx The LeafReaderContext to collect terms from
164
+ @param globalOrds The SortedSetDocValues for the field's ordinals
165
+ @param ordCountConsumer A consumer to accept collected term frequencies
166
+ @return A LeafBucketCollector implementation with collection termination, since collection is complete
167
+ @throws IOException If an I/O error occurs during reading
168
+ */
169
+ LeafBucketCollector termDocFreqCollector (
170
+ LeafReaderContext ctx ,
171
+ SortedSetDocValues globalOrds ,
172
+ BiConsumer <Long , Integer > ordCountConsumer
173
+ ) throws IOException {
174
+ if (weight == null ) {
175
+ // Weight not assigned - cannot use this optimization
176
+ return null ;
177
+ } else {
178
+ if (weight .count (ctx ) == 0 ) {
179
+ // No documents matches top level query on this segment, we can skip the segment entirely
180
+ return LeafBucketCollector .NO_OP_COLLECTOR ;
181
+ } else if (weight .count (ctx ) != ctx .reader ().maxDoc ()) {
182
+ // weight.count(ctx) == ctx.reader().maxDoc() implies there are no deleted documents and
183
+ // top-level query matches all docs in the segment
184
+ return null ;
185
+ }
186
+ }
187
+
188
+ Terms segmentTerms = ctx .reader ().terms (this .fieldName );
189
+ if (segmentTerms == null ) {
190
+ // Field is not indexed.
191
+ return null ;
192
+ }
193
+
194
+ NumericDocValues docCountValues = DocValues .getNumeric (ctx .reader (), DocCountFieldMapper .NAME );
195
+ if (docCountValues .nextDoc () != NO_MORE_DOCS ) {
196
+ // This segment has at least one document with the _doc_count field.
197
+ return null ;
198
+ }
199
+
200
+ TermsEnum indexTermsEnum = segmentTerms .iterator ();
201
+ BytesRef indexTerm = indexTermsEnum .next ();
202
+ TermsEnum globalOrdinalTermsEnum = globalOrds .termsEnum ();
203
+ BytesRef ordinalTerm = globalOrdinalTermsEnum .next ();
204
+
205
+ // Iterate over the terms in the segment, look for matches in the global ordinal terms,
206
+ // and increment bucket count when segment terms match global ordinal terms.
207
+ while (indexTerm != null && ordinalTerm != null ) {
208
+ int compare = indexTerm .compareTo (ordinalTerm );
209
+ if (compare == 0 ) {
210
+ if (acceptedGlobalOrdinals .test (globalOrdinalTermsEnum .ord ())) {
211
+ ordCountConsumer .accept (globalOrdinalTermsEnum .ord (), indexTermsEnum .docFreq ());
212
+ }
213
+ indexTerm = indexTermsEnum .next ();
214
+ ordinalTerm = globalOrdinalTermsEnum .next ();
215
+ } else if (compare < 0 ) {
216
+ indexTerm = indexTermsEnum .next ();
217
+ } else {
218
+ ordinalTerm = globalOrdinalTermsEnum .next ();
219
+ }
220
+ }
221
+ return new LeafBucketCollector () {
222
+ @ Override
223
+ public void collect (int doc , long owningBucketOrd ) throws IOException {
224
+ throw new CollectionTerminatedException ();
225
+ }
226
+ };
227
+ }
228
+
145
229
@ Override
146
230
public LeafBucketCollector getLeafCollector (LeafReaderContext ctx , LeafBucketCollector sub ) throws IOException {
147
231
SortedSetDocValues globalOrds = valuesSource .globalOrdinalsValues (ctx );
148
232
collectionStrategy .globalOrdsReady (globalOrds );
233
+
234
+ if (collectionStrategy instanceof DenseGlobalOrds
235
+ && this .resultStrategy instanceof StandardTermsResults
236
+ && sub == LeafBucketCollector .NO_OP_COLLECTOR ) {
237
+ LeafBucketCollector termDocFreqCollector = termDocFreqCollector (
238
+ ctx ,
239
+ globalOrds ,
240
+ (ord , docCount ) -> incrementBucketDocCount (collectionStrategy .globalOrdToBucketOrd (0 , ord ), docCount )
241
+ );
242
+ if (termDocFreqCollector != null ) {
243
+ return termDocFreqCollector ;
244
+ }
245
+ }
246
+
149
247
SortedDocValues singleValues = DocValues .unwrapSingleton (globalOrds );
150
248
if (singleValues != null ) {
151
249
segmentsWithSingleValuedOrds ++;
@@ -343,9 +441,20 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol
343
441
final SortedSetDocValues segmentOrds = valuesSource .ordinalsValues (ctx );
344
442
segmentDocCounts = context .bigArrays ().grow (segmentDocCounts , 1 + segmentOrds .getValueCount ());
345
443
assert sub == LeafBucketCollector .NO_OP_COLLECTOR ;
346
- final SortedDocValues singleValues = DocValues .unwrapSingleton (segmentOrds );
347
444
mapping = valuesSource .globalOrdinalsMapping (ctx );
348
- // Dense mode doesn't support include/exclude so we don't have to check it here.
445
+
446
+ if (this .resultStrategy instanceof StandardTermsResults ) {
447
+ LeafBucketCollector termDocFreqCollector = this .termDocFreqCollector (
448
+ ctx ,
449
+ segmentOrds ,
450
+ (ord , docCount ) -> incrementBucketDocCount (mapping .applyAsLong (ord ), docCount )
451
+ );
452
+ if (termDocFreqCollector != null ) {
453
+ return termDocFreqCollector ;
454
+ }
455
+ }
456
+
457
+ final SortedDocValues singleValues = DocValues .unwrapSingleton (segmentOrds );
349
458
if (singleValues != null ) {
350
459
segmentsWithSingleValuedOrds ++;
351
460
return resultStrategy .wrapCollector (new LeafBucketCollectorBase (sub , segmentOrds ) {
0 commit comments