Skip to content

Commit 8519fad

Browse files
Daniel Hunte, facebook-github-bot
Daniel Hunte authored and facebook-github-bot committed
feat(fuzzer): Support multiple joins in the join node "toSql" methods for reference query runners (facebookincubator#11801)
Summary: Pull Request resolved: facebookincubator#11801. Currently, the hash join and nested loop join "toSql" methods for all reference query runners only support a single join. This change extends them to support multiple joins, needing only the join node of the last join in the tree. It traverses up the tree and recursively builds the SQL query.
Reviewed By: kevinwilfong
Differential Revision: D66977480
fbshipit-source-id: 24355736dfd1a267435d6c36dcb06580c1e62537
1 parent c385e6c commit 8519fad

12 files changed

+532
-407
lines changed

velox/core/PlanNode.h

+2
Original file line number | Diff line number | Diff line change
@@ -324,6 +324,8 @@ class ValuesNode : public PlanNode {
324324
const size_t repeatTimes_;
325325
};
326326

327+
using ValuesNodePtr = std::shared_ptr<const ValuesNode>;
328+
327329
class ArrowStreamNode : public PlanNode {
328330
public:
329331
ArrowStreamNode(

velox/exec/fuzzer/CMakeLists.txt

+7-2
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,13 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
add_library(velox_fuzzer_util DuckQueryRunner.cpp PrestoQueryRunner.cpp
16-
FuzzerUtil.cpp ToSQLUtil.cpp)
15+
add_library(
16+
velox_fuzzer_util
17+
ReferenceQueryRunner.cpp
18+
DuckQueryRunner.cpp
19+
PrestoQueryRunner.cpp
20+
FuzzerUtil.cpp
21+
ToSQLUtil.cpp)
1722

1823
target_link_libraries(
1924
velox_fuzzer_util

velox/exec/fuzzer/DuckQueryRunner.cpp

+37-144
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,11 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16+
17+
#include <optional>
18+
#include <set>
19+
#include <unordered_map>
20+
1621
#include "velox/exec/fuzzer/DuckQueryRunner.h"
1722
#include "velox/exec/fuzzer/ToSQLUtil.h"
1823
#include "velox/exec/tests/utils/QueryAssertions.h"
@@ -102,23 +107,39 @@ DuckQueryRunner::aggregationFunctionDataSpecs() const {
102107
return kAggregationFunctionDataSpecs;
103108
}
104109

105-
std::multiset<std::vector<velox::variant>> DuckQueryRunner::execute(
106-
const std::string& sql,
107-
const std::vector<RowVectorPtr>& input,
108-
const RowTypePtr& resultType) {
109-
DuckDbQueryRunner queryRunner;
110-
queryRunner.createTable("tmp", input);
111-
return queryRunner.execute(sql, resultType);
110+
std::pair<
111+
std::optional<std::multiset<std::vector<velox::variant>>>,
112+
ReferenceQueryErrorCode>
113+
DuckQueryRunner::execute(const core::PlanNodePtr& plan) {
114+
if (std::optional<std::string> sql = toSql(plan)) {
115+
try {
116+
DuckDbQueryRunner queryRunner;
117+
std::unordered_map<std::string, std::vector<RowVectorPtr>> inputMap =
118+
getAllTables(plan);
119+
for (const auto& [tableName, input] : inputMap) {
120+
queryRunner.createTable(tableName, input);
121+
}
122+
return std::make_pair(
123+
queryRunner.execute(*sql, plan->outputType()),
124+
ReferenceQueryErrorCode::kSuccess);
125+
} catch (...) {
126+
LOG(WARNING) << "Query failed in DuckDB";
127+
return std::make_pair(
128+
std::nullopt, ReferenceQueryErrorCode::kReferenceQueryFail);
129+
}
130+
}
131+
132+
LOG(INFO) << "Query not supported in DuckDB";
133+
return std::make_pair(
134+
std::nullopt, ReferenceQueryErrorCode::kReferenceQueryUnsupported);
112135
}
113136

114137
std::multiset<std::vector<velox::variant>> DuckQueryRunner::execute(
115138
const std::string& sql,
116-
const std::vector<RowVectorPtr>& probeInput,
117-
const std::vector<RowVectorPtr>& buildInput,
139+
const std::vector<RowVectorPtr>& input,
118140
const RowTypePtr& resultType) {
119141
DuckDbQueryRunner queryRunner;
120-
queryRunner.createTable("t", probeInput);
121-
queryRunner.createTable("u", buildInput);
142+
queryRunner.createTable("tmp", input);
122143
return queryRunner.execute(sql, resultType);
123144
}
124145

@@ -164,6 +185,11 @@ std::optional<std::string> DuckQueryRunner::toSql(
164185
return toSql(joinNode);
165186
}
166187

188+
if (const auto valuesNode =
189+
std::dynamic_pointer_cast<const core::ValuesNode>(plan)) {
190+
return toSql(valuesNode);
191+
}
192+
167193
VELOX_NYI();
168194
}
169195

@@ -340,137 +366,4 @@ std::optional<std::string> DuckQueryRunner::toSql(
340366

341367
return sql.str();
342368
}
343-
344-
std::optional<std::string> DuckQueryRunner::toSql(
345-
const std::shared_ptr<const core::HashJoinNode>& joinNode) {
346-
const auto& joinKeysToSql = [](auto keys) {
347-
std::stringstream out;
348-
for (auto i = 0; i < keys.size(); ++i) {
349-
if (i > 0) {
350-
out << ", ";
351-
}
352-
out << keys[i]->name();
353-
}
354-
return out.str();
355-
};
356-
357-
const auto filterToSql = [](core::TypedExprPtr filter) {
358-
auto call = std::dynamic_pointer_cast<const core::CallTypedExpr>(filter);
359-
return toCallSql(call);
360-
};
361-
362-
const auto& joinConditionAsSql = [&](auto joinNode) {
363-
std::stringstream out;
364-
for (auto i = 0; i < joinNode->leftKeys().size(); ++i) {
365-
if (i > 0) {
366-
out << " AND ";
367-
}
368-
out << joinNode->leftKeys()[i]->name() << " = "
369-
<< joinNode->rightKeys()[i]->name();
370-
}
371-
if (joinNode->filter()) {
372-
out << " AND " << filterToSql(joinNode->filter());
373-
}
374-
return out.str();
375-
};
376-
377-
const auto& outputNames = joinNode->outputType()->names();
378-
379-
std::stringstream sql;
380-
if (joinNode->isLeftSemiProjectJoin()) {
381-
sql << "SELECT "
382-
<< folly::join(", ", outputNames.begin(), --outputNames.end());
383-
} else {
384-
sql << "SELECT " << folly::join(", ", outputNames);
385-
}
386-
387-
switch (joinNode->joinType()) {
388-
case core::JoinType::kInner:
389-
sql << " FROM t INNER JOIN u ON " << joinConditionAsSql(joinNode);
390-
break;
391-
case core::JoinType::kLeft:
392-
sql << " FROM t LEFT JOIN u ON " << joinConditionAsSql(joinNode);
393-
break;
394-
case core::JoinType::kFull:
395-
sql << " FROM t FULL OUTER JOIN u ON " << joinConditionAsSql(joinNode);
396-
break;
397-
case core::JoinType::kLeftSemiFilter:
398-
// Multiple columns returned by a scalar subquery is not supported in
399-
// DuckDB. A scalar subquery expression is a subquery that returns one
400-
// result row from exactly one column for every input row.
401-
if (joinNode->leftKeys().size() > 1) {
402-
return std::nullopt;
403-
}
404-
sql << " FROM t WHERE " << joinKeysToSql(joinNode->leftKeys())
405-
<< " IN (SELECT " << joinKeysToSql(joinNode->rightKeys())
406-
<< " FROM u";
407-
if (joinNode->filter()) {
408-
sql << " WHERE " << filterToSql(joinNode->filter());
409-
}
410-
sql << ")";
411-
break;
412-
case core::JoinType::kLeftSemiProject:
413-
if (joinNode->isNullAware()) {
414-
sql << ", " << joinKeysToSql(joinNode->leftKeys()) << " IN (SELECT "
415-
<< joinKeysToSql(joinNode->rightKeys()) << " FROM u";
416-
if (joinNode->filter()) {
417-
sql << " WHERE " << filterToSql(joinNode->filter());
418-
}
419-
sql << ") FROM t";
420-
} else {
421-
sql << ", EXISTS (SELECT * FROM u WHERE "
422-
<< joinConditionAsSql(joinNode);
423-
sql << ") FROM t";
424-
}
425-
break;
426-
case core::JoinType::kAnti:
427-
if (joinNode->isNullAware()) {
428-
sql << " FROM t WHERE " << joinKeysToSql(joinNode->leftKeys())
429-
<< " NOT IN (SELECT " << joinKeysToSql(joinNode->rightKeys())
430-
<< " FROM u";
431-
if (joinNode->filter()) {
432-
sql << " WHERE " << filterToSql(joinNode->filter());
433-
}
434-
sql << ")";
435-
} else {
436-
sql << " FROM t WHERE NOT EXISTS (SELECT * FROM u WHERE "
437-
<< joinConditionAsSql(joinNode);
438-
sql << ")";
439-
}
440-
break;
441-
default:
442-
VELOX_UNREACHABLE(
443-
"Unknown join type: {}", static_cast<int>(joinNode->joinType()));
444-
}
445-
446-
return sql.str();
447-
}
448-
449-
std::optional<std::string> DuckQueryRunner::toSql(
450-
const std::shared_ptr<const core::NestedLoopJoinNode>& joinNode) {
451-
std::stringstream sql;
452-
sql << "SELECT " << folly::join(", ", joinNode->outputType()->names());
453-
454-
// Nested loop join without filter.
455-
VELOX_CHECK(
456-
joinNode->joinCondition() == nullptr,
457-
"This code path should be called only for nested loop join without filter");
458-
const std::string joinCondition{"(1 = 1)"};
459-
switch (joinNode->joinType()) {
460-
case core::JoinType::kInner:
461-
sql << " FROM t INNER JOIN u ON " << joinCondition;
462-
break;
463-
case core::JoinType::kLeft:
464-
sql << " FROM t LEFT JOIN u ON " << joinCondition;
465-
break;
466-
case core::JoinType::kFull:
467-
sql << " FROM t FULL OUTER JOIN u ON " << joinCondition;
468-
break;
469-
default:
470-
VELOX_UNREACHABLE(
471-
"Unknown join type: {}", static_cast<int>(joinNode->joinType()));
472-
}
473-
474-
return sql.str();
475-
}
476369
} // namespace facebook::velox::exec::test

velox/exec/fuzzer/DuckQueryRunner.h

+13-12
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,10 @@
1515
*/
1616
#pragma once
1717

18+
#include <optional>
19+
#include <set>
20+
#include <unordered_map>
21+
1822
#include "velox/exec/fuzzer/ReferenceQueryRunner.h"
1923

2024
namespace facebook::velox::exec::test {
@@ -46,20 +50,23 @@ class DuckQueryRunner : public ReferenceQueryRunner {
4650
/// Assumes that source of AggregationNode or Window Node is 'tmp' table.
4751
std::optional<std::string> toSql(const core::PlanNodePtr& plan) override;
4852

53+
/// Executes the plan and returns the result along with success or fail error
54+
/// code.
55+
std::pair<
56+
std::optional<std::multiset<std::vector<velox::variant>>>,
57+
ReferenceQueryErrorCode>
58+
execute(const core::PlanNodePtr& plan) override;
59+
4960
/// Creates 'tmp' table with 'input' data and runs 'sql' query. Returns
5061
/// results according to 'resultType' schema.
5162
std::multiset<std::vector<velox::variant>> execute(
5263
const std::string& sql,
5364
const std::vector<RowVectorPtr>& input,
5465
const RowTypePtr& resultType) override;
5566

56-
std::multiset<std::vector<velox::variant>> execute(
57-
const std::string& sql,
58-
const std::vector<RowVectorPtr>& probeInput,
59-
const std::vector<RowVectorPtr>& buildInput,
60-
const RowTypePtr& resultType) override;
61-
6267
private:
68+
using ReferenceQueryRunner::toSql;
69+
6370
std::optional<std::string> toSql(
6471
const std::shared_ptr<const core::AggregationNode>& aggregationNode);
6572

@@ -72,12 +79,6 @@ class DuckQueryRunner : public ReferenceQueryRunner {
7279
std::optional<std::string> toSql(
7380
const std::shared_ptr<const core::RowNumberNode>& rowNumberNode);
7481

75-
std::optional<std::string> toSql(
76-
const std::shared_ptr<const core::HashJoinNode>& joinNode);
77-
78-
std::optional<std::string> toSql(
79-
const std::shared_ptr<const core::NestedLoopJoinNode>& joinNode);
80-
8182
std::unordered_set<std::string> aggregateFunctionNames_;
8283
};
8384

velox/exec/fuzzer/FuzzerUtil.h

-6
Original file line number | Diff line number | Diff line change
@@ -131,12 +131,6 @@ void setupMemory(
131131
void registerHiveConnector(
132132
const std::unordered_map<std::string, std::string>& hiveConfigs);
133133

134-
enum ReferenceQueryErrorCode {
135-
kSuccess,
136-
kReferenceQueryFail,
137-
kReferenceQueryUnsupported
138-
};
139-
140134
// Converts 'plan' into an SQL query and runs it on 'input' in the reference DB.
141135
// Result is returned as a MaterializedRowMultiset with the
142136
// ReferenceQueryErrorCode::kSuccess if successful, or an std::nullopt with a

velox/exec/fuzzer/JoinFuzzer.cpp

+4-4
Original file line number | Diff line number | Diff line change
@@ -679,12 +679,12 @@ std::optional<MaterializedRowMultiset> JoinFuzzer::computeReferenceResults(
679679
VELOX_CHECK(!containsUnsupportedTypes(buildInput[0]->type()));
680680
}
681681

682-
if (auto sql = referenceQueryRunner_->toSql(plan)) {
683-
return referenceQueryRunner_->execute(
684-
sql.value(), probeInput, buildInput, plan->outputType());
682+
auto result = referenceQueryRunner_->execute(plan);
683+
if (result.first) {
684+
return result.first;
685685
}
686686

687-
LOG(INFO) << "Query not supported by the reference DB";
687+
LOG(INFO) << "Query not supported by or failed in the reference DB";
688688
return std::nullopt;
689689
}
690690

0 commit comments

Comments
 (0)