Skip to content

Commit 9f05d71

Browse files
zhli1142015 authored and facebook-github-bot committed
feat: Combine low selectivity vectors generated by the hash join filter (facebookincubator#11739)
Summary: This PR re-applies [facebookincubator/velox#10987](https://github.com/facebookincubator/velox/pull/10987) with the following changes: 1. Addresses the feedback raised in a review comment on facebookincubator#10987. 2. Ensures the loop terminates when outputBytes exceeds `preferredOutputBatchBytes`. With this change we observed a 45% performance gain in TPC-DS query 72. Pull Request resolved: facebookincubator#11739. Reviewed By: Yuhta. Differential Revision: D68024717. Pulled By: xiaoxmeng. fbshipit-source-id: 50f0343d00f1c22e829aa1913a3f2f4dc63bcd23
1 parent 47bad1a commit 9f05d71

File tree

5 files changed

+275
-64
lines changed

5 files changed

+275
-64
lines changed

velox/exec/HashProbe.cpp

+94-50
Original file line numberDiff line numberDiff line change
@@ -1059,24 +1059,35 @@ RowVectorPtr HashProbe::getOutputInternal(bool toSpillOutput) {
10591059
// Left semi and anti joins are always cardinality reducing, e.g. for a
10601060
// given row of input they produce zero or 1 row of output. Therefore, if
10611061
// there is no extra filter we can process each batch of input in one go.
1062-
auto outputBatchSize = (isLeftSemiOrAntiJoinNoFilter || emptyBuildSide)
1062+
auto maxOutputBatchRows = (isLeftSemiOrAntiJoinNoFilter || emptyBuildSide)
10631063
? inputSize
10641064
: outputBatchSize_;
1065-
outputTableRowsCapacity_ = outputBatchSize;
1066-
if (filter_ &&
1067-
(isLeftJoin(joinType_) || isFullJoin(joinType_) ||
1068-
isAntiJoin(joinType_) || isLeftSemiFilterJoin(joinType_) ||
1069-
isLeftSemiProjectJoin(joinType_))) {
1070-
// If we need non-matching probe side row, there is a possibility that such
1071-
// row exists at end of an input batch and being carried over in the next
1072-
// output batch, so we need to make extra room of one row in output.
1073-
++outputTableRowsCapacity_;
1065+
outputTableRowsCapacity_ = maxOutputBatchRows;
1066+
if (filter_) {
1067+
if (isLeftJoin(joinType_) || isFullJoin(joinType_) ||
1068+
isAntiJoin(joinType_) || isLeftSemiFilterJoin(joinType_) ||
1069+
isLeftSemiProjectJoin(joinType_)) {
1070+
// If we need non-matching probe side row, there is a possibility that
1071+
// such row exists at end of an input batch and being carried over in the
1072+
// next output batch, so we need to make extra room of one row in output.
1073+
++outputTableRowsCapacity_;
1074+
}
1075+
1076+
// Initialize 'leftSemiProjectIsNull_' for a null-aware left semi join.
1077+
if (isLeftSemiProjectJoin(joinType_) && nullAware_) {
1078+
leftSemiProjectIsNull_.resize(outputTableRowsCapacity_);
1079+
leftSemiProjectIsNull_.clearAll();
1080+
}
10741081
}
1082+
10751083
auto mapping = initializeRowNumberMapping(
10761084
outputRowMapping_, outputTableRowsCapacity_, pool());
10771085
auto* outputTableRows =
10781086
initBuffer<char*>(outputTableRows_, outputTableRowsCapacity_, pool());
10791087

1088+
int numOutputRows = 0;
1089+
uint64_t maxOutputBatchBytes =
1090+
operatorCtx_->driverCtx()->queryConfig().preferredOutputBatchBytes();
10801091
for (;;) {
10811092
// If the task owning this operator has been cancelled, there is no point
10821093
// to continue executing this procedure, which may be long in degenerate
@@ -1085,14 +1096,14 @@ RowVectorPtr HashProbe::getOutputInternal(bool toSpillOutput) {
10851096
if (operatorCtx_->task()->isCancelled()) {
10861097
return nullptr;
10871098
}
1088-
int numOut = 0;
1099+
int numJoinedRows = 0;
10891100

10901101
if (emptyBuildSide) {
10911102
// When build side is empty, anti and left joins return all probe side
10921103
// rows, including ones with null join keys.
10931104
std::iota(mapping.begin(), mapping.begin() + inputSize, 0);
10941105
std::fill(outputTableRows, outputTableRows + inputSize, nullptr);
1095-
numOut = inputSize;
1106+
numJoinedRows = inputSize;
10961107
} else if (isAntiJoin(joinType_) && !filter_) {
10971108
if (nullAware_) {
10981109
// When build side is not empty, anti join without a filter returns
@@ -1101,46 +1112,48 @@ RowVectorPtr HashProbe::getOutputInternal(bool toSpillOutput) {
11011112
for (auto i = 0; i < inputSize; ++i) {
11021113
if (nonNullInputRows_.isValid(i) &&
11031114
(!activeRows_.isValid(i) || !lookup_->hits[i])) {
1104-
mapping[numOut] = i;
1105-
++numOut;
1115+
mapping[numJoinedRows++] = i;
11061116
}
11071117
}
11081118
} else {
11091119
for (auto i = 0; i < inputSize; ++i) {
11101120
if (!nonNullInputRows_.isValid(i) ||
11111121
(!activeRows_.isValid(i) || !lookup_->hits[i])) {
1112-
mapping[numOut] = i;
1113-
++numOut;
1122+
mapping[numJoinedRows++] = i;
11141123
}
11151124
}
11161125
}
11171126
} else {
1118-
numOut = table_->listJoinResults(
1127+
numJoinedRows = table_->listJoinResults(
11191128
*resultIter_,
11201129
joinIncludesMissesFromLeft(joinType_),
1121-
folly::Range(mapping.data(), outputBatchSize),
1122-
folly::Range(outputTableRows, outputBatchSize),
1123-
operatorCtx_->driverCtx()->queryConfig().preferredOutputBatchBytes());
1130+
folly::Range(mapping.data(), maxOutputBatchRows),
1131+
folly::Range(outputTableRows, maxOutputBatchRows),
1132+
maxOutputBatchBytes);
11241133
}
11251134

11261135
// We are done processing the input batch if there are no more joined rows
11271136
// to process and the NoMatchDetector isn't carrying forward a row that
11281137
// still needs to be written to the output.
1129-
if (!numOut && !noMatchDetector_.hasLastMissedRow()) {
1138+
if (!numJoinedRows && !noMatchDetector_.hasLastMissedRow()) {
1139+
if (numOutputRows > 0) {
1140+
fillOutput(numOutputRows);
1141+
input_ = nullptr;
1142+
return output_;
1143+
}
11301144
input_ = nullptr;
11311145
return nullptr;
11321146
}
1133-
VELOX_CHECK_LE(numOut, outputBatchSize);
1147+
VELOX_CHECK_LE(numJoinedRows, maxOutputBatchRows);
1148+
auto numJoinedRowsAfterFilter = evalFilter(numOutputRows, numJoinedRows);
11341149

1135-
numOut = evalFilter(numOut);
1136-
1137-
if (numOut == 0) {
1150+
if (numJoinedRowsAfterFilter == 0) {
11381151
continue;
11391152
}
11401153

11411154
if (needLastProbe()) {
11421155
// Mark build-side rows that have a match on the join condition.
1143-
table_->rows()->setProbedFlag(outputTableRows, numOut);
1156+
table_->rows()->setProbedFlag(outputTableRows, numJoinedRowsAfterFilter);
11441157
}
11451158

11461159
// Right semi join only returns the build side output when the probe side
@@ -1152,7 +1165,35 @@ RowVectorPtr HashProbe::getOutputInternal(bool toSpillOutput) {
11521165
return nullptr;
11531166
}
11541167

1155-
fillOutput(numOut);
1168+
if (numJoinedRowsAfterFilter < numJoinedRows || numOutputRows > 0) {
1169+
numOutputRows += numJoinedRowsAfterFilter;
1170+
// Calculates the estimated size of the output batch in bytes after
1171+
// applying a filter. The estimation is based on the ratio of the number
1172+
// of joined rows to the number of output rows before the filter,
1173+
// multiplied by the size of the output batch in bytes.
1174+
const auto estimatedOutputBatchBytes =
1175+
(1.0 * numJoinedRowsAfterFilter / numJoinedRows) *
1176+
resultIter_->outputBatchBytes;
1177+
// Continue the loop to populate 'outputRowMapping_' and
1178+
// 'outputTableRows_' until either all input rows are processed or the
1179+
// desired row count / max bytes is reached, avoiding low-selectivity
1180+
// vectors.
1181+
if (!resultIter_->atEnd() && numOutputRows < maxOutputBatchRows &&
1182+
estimatedOutputBatchBytes < maxOutputBatchBytes) {
1183+
mapping = folly::Range(
1184+
outputRowMapping_->asMutable<vector_size_t>() + numOutputRows,
1185+
outputTableRowsCapacity_ - numOutputRows);
1186+
outputTableRows = outputTableRows_->asMutable<char*>() + numOutputRows;
1187+
maxOutputBatchRows -= numJoinedRowsAfterFilter;
1188+
maxOutputBatchBytes -= estimatedOutputBatchBytes;
1189+
continue;
1190+
}
1191+
}
1192+
if (numOutputRows > 0) {
1193+
numJoinedRowsAfterFilter = numOutputRows;
1194+
}
1195+
1196+
fillOutput(numJoinedRowsAfterFilter);
11561197

11571198
if (isLeftSemiOrAntiJoinNoFilter || emptyBuildSide) {
11581199
input_ = nullptr;
@@ -1177,7 +1218,15 @@ bool HashProbe::maybeReadSpillOutput() {
11771218
return true;
11781219
}
11791220

1180-
RowVectorPtr HashProbe::createFilterInput(vector_size_t size) {
1221+
RowVectorPtr HashProbe::createFilterInput(
1222+
vector_size_t offset,
1223+
vector_size_t size) {
1224+
BufferPtr outputRowMapping = outputRowMapping_;
1225+
if (offset > 0) {
1226+
VELOX_CHECK_LE(size, outputTableRowsCapacity_ - offset);
1227+
outputRowMapping = Buffer::slice<vector_size_t>(
1228+
outputRowMapping_, offset, outputTableRowsCapacity_ - offset, pool());
1229+
}
11811230
std::vector<VectorPtr> filterColumns(filterInputType_->size());
11821231
for (auto projection : filterInputProjections_) {
11831232
if (projectedInputColumns_.find(projection.inputChannel) !=
@@ -1194,12 +1243,12 @@ RowVectorPtr HashProbe::createFilterInput(vector_size_t size) {
11941243
}
11951244

11961245
filterColumns[projection.outputChannel] = wrapChild(
1197-
size, outputRowMapping_, input_->childAt(projection.inputChannel));
1246+
size, outputRowMapping, input_->childAt(projection.inputChannel));
11981247
}
11991248

12001249
extractColumns(
12011250
table_.get(),
1202-
folly::Range<char* const*>(outputTableRows_->as<char*>(), size),
1251+
folly::Range<char* const*>(outputTableRows_->as<char*>() + offset, size),
12031252
filterTableProjections_,
12041253
pool(),
12051254
filterInputType_->children(),
@@ -1212,7 +1261,8 @@ RowVectorPtr HashProbe::createFilterInput(vector_size_t size) {
12121261
void HashProbe::prepareFilterRowsForNullAwareJoin(
12131262
RowVectorPtr& filterInput,
12141263
vector_size_t numRows,
1215-
bool filterPropagateNulls) {
1264+
bool filterPropagateNulls,
1265+
vector_size_t* rawOutputProbeRowMapping) {
12161266
VELOX_CHECK_LE(numRows, kBatchSize);
12171267
if (filterTableInput_ == nullptr) {
12181268
filterTableInput_ =
@@ -1255,10 +1305,9 @@ void HashProbe::prepareFilterRowsForNullAwareJoin(
12551305
// with null join key columns(s) as we can apply filtering after they cross
12561306
// join with the table rows later.
12571307
if (!nonNullInputRows_.isAllSelected()) {
1258-
auto* rawMapping = outputRowMapping_->asMutable<vector_size_t>();
12591308
for (int i = 0; i < numRows; ++i) {
12601309
if (filterInputRows_.isValid(i) &&
1261-
!nonNullInputRows_.isValid(rawMapping[i])) {
1310+
!nonNullInputRows_.isValid(rawOutputProbeRowMapping[i])) {
12621311
filterInputRows_.setValid(i, false);
12631312
}
12641313
}
@@ -1345,10 +1394,8 @@ void HashProbe::applyFilterOnTableRowsForNullAwareJoin(
13451394

13461395
SelectivityVector HashProbe::evalFilterForNullAwareJoin(
13471396
vector_size_t numRows,
1348-
bool filterPropagateNulls) {
1349-
auto* rawOutputProbeRowMapping =
1350-
outputRowMapping_->asMutable<vector_size_t>();
1351-
1397+
bool filterPropagateNulls,
1398+
vector_size_t* rawOutputProbeRowMapping) {
13521399
// Subset of probe-side rows with a match that passed the filter.
13531400
SelectivityVector filterPassedRows(input_->size(), false);
13541401

@@ -1417,15 +1464,15 @@ void HashProbe::prepareNullKeyProbeHashers() {
14171464
}
14181465
}
14191466

1420-
int32_t HashProbe::evalFilter(int32_t numRows) {
1467+
int32_t HashProbe::evalFilter(int32_t offset, int32_t numRows) {
14211468
if (!filter_) {
14221469
return numRows;
14231470
}
14241471

14251472
const bool filterPropagateNulls = filter_->expr(0)->propagatesNulls();
14261473
auto* rawOutputProbeRowMapping =
1427-
outputRowMapping_->asMutable<vector_size_t>();
1428-
auto* outputTableRows = outputTableRows_->asMutable<char*>();
1474+
outputRowMapping_->asMutable<vector_size_t>() + offset;
1475+
auto* outputTableRows = outputTableRows_->asMutable<char*>() + offset;
14291476

14301477
filterInputRows_.resizeFill(numRows);
14311478

@@ -1443,11 +1490,11 @@ int32_t HashProbe::evalFilter(int32_t numRows) {
14431490
filterInputRows_.updateBounds();
14441491
}
14451492

1446-
RowVectorPtr filterInput = createFilterInput(numRows);
1493+
RowVectorPtr filterInput = createFilterInput(offset, numRows);
14471494

14481495
if (nullAware_) {
14491496
prepareFilterRowsForNullAwareJoin(
1450-
filterInput, numRows, filterPropagateNulls);
1497+
filterInput, numRows, filterPropagateNulls, rawOutputProbeRowMapping);
14511498
}
14521499

14531500
EvalCtx evalCtx(operatorCtx_->execCtx(), filter_.get(), filterInput.get());
@@ -1525,21 +1572,18 @@ int32_t HashProbe::evalFilter(int32_t numRows) {
15251572
static const char* kPassed = "passed";
15261573

15271574
if (nullAware_) {
1528-
leftSemiProjectIsNull_.resize(numRows);
1529-
leftSemiProjectIsNull_.clearAll();
1530-
15311575
auto addLast = [&](auto row, std::optional<bool> passed) {
15321576
if (passed.has_value()) {
15331577
outputTableRows[numPassed] =
15341578
passed.value() ? const_cast<char*>(kPassed) : nullptr;
15351579
} else {
1536-
leftSemiProjectIsNull_.setValid(numPassed, true);
1580+
leftSemiProjectIsNull_.setValid(numPassed + offset, true);
15371581
}
15381582
rawOutputProbeRowMapping[numPassed++] = row;
15391583
};
15401584

1541-
auto passedRows =
1542-
evalFilterForNullAwareJoin(numRows, filterPropagateNulls);
1585+
auto passedRows = evalFilterForNullAwareJoin(
1586+
numRows, filterPropagateNulls, rawOutputProbeRowMapping);
15431587
for (auto i = 0; i < numRows; ++i) {
15441588
// filterPassed(i) -> TRUE
15451589
// else passed -> NULL
@@ -1575,8 +1619,8 @@ int32_t HashProbe::evalFilter(int32_t numRows) {
15751619
rawOutputProbeRowMapping[numPassed++] = row;
15761620
};
15771621
if (nullAware_) {
1578-
auto passedRows =
1579-
evalFilterForNullAwareJoin(numRows, filterPropagateNulls);
1622+
auto passedRows = evalFilterForNullAwareJoin(
1623+
numRows, filterPropagateNulls, rawOutputProbeRowMapping);
15801624
for (auto i = 0; i < numRows; ++i) {
15811625
auto probeRow = rawOutputProbeRowMapping[i];
15821626
bool passed = passedRows.isValid(probeRow);

velox/exec/HashProbe.h

+8-6
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,9 @@ class HashProbe : public Operator {
136136
// for right join and full join.
137137
RowVectorPtr getBuildSideOutput();
138138

139-
// Applies 'filter_' to 'outputTableRows_' and updates 'outputRowMapping_'.
140-
// Returns the number of passing rows.
141-
vector_size_t evalFilter(vector_size_t numRows);
139+
// Apply 'filter_' to 'outputTableRows_' from 'offset' for 'numRows' entries,
140+
// updating 'outputRowMapping_'. Returns the number of passing rows.
141+
vector_size_t evalFilter(vector_size_t offset, vector_size_t numRows);
142142

143143
inline bool filterPassed(vector_size_t row) {
144144
return filterInputRows_.isValid(row) &&
@@ -149,7 +149,7 @@ class HashProbe : public Operator {
149149
// Create a temporary input vector to be passed to the filter. This ensures it
150150
// gets destroyed in case its wrapping an unloaded vector which eventually
151151
// needs to be wrapped in fillOutput().
152-
RowVectorPtr createFilterInput(vector_size_t size);
152+
RowVectorPtr createFilterInput(vector_size_t offset, vector_size_t size);
153153

154154
// Prepare filter row selectivity for null-aware join. 'numRows'
155155
// specifies the number of rows in 'filterInputRows_' to process. If
@@ -158,12 +158,14 @@ class HashProbe : public Operator {
158158
void prepareFilterRowsForNullAwareJoin(
159159
RowVectorPtr& filterInput,
160160
vector_size_t numRows,
161-
bool filterPropagateNulls);
161+
bool filterPropagateNulls,
162+
vector_size_t* rawOutputProbeRowMapping);
162163

163164
// Evaluate the filter for null-aware anti or left semi project join.
164165
SelectivityVector evalFilterForNullAwareJoin(
165166
vector_size_t numRows,
166-
bool filterPropagateNulls);
167+
bool filterPropagateNulls,
168+
vector_size_t* rawOutputProbeRowMapping);
167169

168170
// Prepares the hashers for probing with null keys.
169171
// Initializes `nullKeyProbeHashers_` if empty, ensuring it has exactly one

velox/exec/HashTable.cpp

+9-8
Original file line numberDiff line numberDiff line change
@@ -1840,7 +1840,7 @@ int32_t HashTable<ignoreNullKeys>::listJoinResults(
18401840
folly::Range<char**> hits,
18411841
uint64_t maxBytes) {
18421842
VELOX_CHECK_LE(inputRows.size(), hits.size());
1843-
1843+
iter.outputBatchBytes = 0;
18441844
if (iter.estimatedRowSize.has_value() && !hasDuplicates_) {
18451845
// When there is no duplicates, and row size is estimable, we are able to
18461846
// go through fast path.
@@ -1850,7 +1850,6 @@ int32_t HashTable<ignoreNullKeys>::listJoinResults(
18501850

18511851
size_t numOut = 0;
18521852
auto maxOut = inputRows.size();
1853-
uint64_t totalBytes{0};
18541853
while (iter.lastRowIndex < iter.rows->size()) {
18551854
auto row = (*iter.rows)[iter.lastRowIndex];
18561855
auto hit = (*iter.hits)[row]; // NOLINT
@@ -1873,7 +1872,7 @@ int32_t HashTable<ignoreNullKeys>::listJoinResults(
18731872
hits[numOut] = hit;
18741873
numOut++;
18751874
iter.lastRowIndex++;
1876-
totalBytes += iter.estimatedRowSize.has_value()
1875+
iter.outputBatchBytes += iter.estimatedRowSize.has_value()
18771876
? iter.estimatedRowSize.value()
18781877
: (joinProjectedVarColumnsSize(iter.varSizeListColumns, hit) +
18791878
iter.fixedSizeListColumnsSizeSum);
@@ -1889,19 +1888,20 @@ int32_t HashTable<ignoreNullKeys>::listJoinResults(
18891888
iter.lastDuplicateRowIndex += num;
18901889
numOut += num;
18911890
if (iter.estimatedRowSize.has_value()) {
1892-
totalBytes += iter.estimatedRowSize.value() * numRows;
1891+
iter.outputBatchBytes += iter.estimatedRowSize.value() * numRows;
18931892
} else {
1894-
totalBytes +=
1893+
iter.outputBatchBytes +=
18951894
joinProjectedVarColumnsSize(iter.varSizeListColumns, rows);
1896-
totalBytes += (iter.fixedSizeListColumnsSizeSum * rows->size());
1897-
totalBytes += (iter.fixedSizeListColumnsSizeSum * numRows);
1895+
iter.outputBatchBytes +=
1896+
(iter.fixedSizeListColumnsSizeSum * rows->size());
1897+
iter.outputBatchBytes += (iter.fixedSizeListColumnsSizeSum * numRows);
18981898
}
18991899
if (iter.lastDuplicateRowIndex >= numRows) {
19001900
iter.lastDuplicateRowIndex = 0;
19011901
iter.lastRowIndex++;
19021902
}
19031903
}
1904-
if (numOut >= maxOut || totalBytes >= maxBytes) {
1904+
if (numOut >= maxOut || iter.outputBatchBytes >= maxBytes) {
19051905
return numOut;
19061906
}
19071907
}
@@ -1965,6 +1965,7 @@ int32_t HashTable<ignoreNullKeys>::listJoinResultsFastPath(
19651965
}
19661966

19671967
iter.lastRowIndex = i;
1968+
iter.outputBatchBytes += numOut * iter.estimatedRowSize.value();
19681969
return numOut;
19691970
}
19701971

0 commit comments

Comments
 (0)