Skip to content

Commit 7e4656f

Browse files
xiaoxmengfacebook-github-bot
authored andcommitted
fix: Back out "fix: Fix the driver block hanging issue in serialized execution mode" (facebookincubator#11681)
Summary: Pull Request resolved: facebookincubator#11681 Original commit changeset: c052fa3de2f4 Original Phabricator Diff: D66438632 Reviewed By: amitkdutta, bikramSingh91, weijiadeng-uber Differential Revision: D66548972 fbshipit-source-id: c36d7230c57b7e90d44e0ce6bd43e6025229d8b4
1 parent e80bf12 commit 7e4656f

File tree

5 files changed

+16
-427
lines changed

5 files changed

+16
-427
lines changed

velox/exec/Driver.cpp

+1-4
Original file line numberDiff line numberDiff line change
@@ -1146,11 +1146,8 @@ std::string blockingReasonToString(BlockingReason reason) {
11461146
return "kYield";
11471147
case BlockingReason::kWaitForArbitration:
11481148
return "kWaitForArbitration";
1149-
default:
1150-
break;
11511149
}
1152-
VELOX_UNREACHABLE(
1153-
fmt::format("Unknown blocking reason {}", static_cast<int>(reason)));
1150+
VELOX_UNREACHABLE();
11541151
}
11551152

11561153
DriverThreadContext* driverThreadContext() {

velox/exec/Task.cpp

+10-95
Original file line numberDiff line numberDiff line change
@@ -647,7 +647,9 @@ RowVectorPtr Task::next(ContinueFuture* future) {
647647
}
648648

649649
VELOX_CHECK_EQ(
650-
state_, TaskState::kRunning, "Task has already finished processing.");
650+
static_cast<int>(state_),
651+
static_cast<int>(kRunning),
652+
"Task has already finished processing.");
651653

652654
// On first call, create the drivers.
653655
if (driverFactories_.empty()) {
@@ -682,11 +684,6 @@ RowVectorPtr Task::next(ContinueFuture* future) {
682684
}
683685

684686
drivers_ = std::move(drivers);
685-
driverBlockingStates_.reserve(drivers_.size());
686-
for (auto i = 0; i < drivers_.size(); ++i) {
687-
driverBlockingStates_.emplace_back(
688-
std::make_unique<DriverBlockingState>(drivers_[i].get()));
689-
}
690687
}
691688

692689
// Run drivers one at a time. If a driver blocks, continue running the other
@@ -701,10 +698,7 @@ RowVectorPtr Task::next(ContinueFuture* future) {
701698
int runnableDrivers = 0;
702699
int blockedDrivers = 0;
703700
for (auto i = 0; i < numDrivers; ++i) {
704-
// Holds a reference to driver for access as async task terminate might
705-
// remove drivers from 'drivers_' slot.
706-
auto driver = getDriver(i);
707-
if (driver == nullptr) {
701+
if (drivers_[i] == nullptr) {
708702
// This driver has finished processing.
709703
continue;
710704
}
@@ -715,25 +709,16 @@ RowVectorPtr Task::next(ContinueFuture* future) {
715709
continue;
716710
}
717711

718-
ContinueFuture blockFuture = ContinueFuture::makeEmpty();
719-
if (driverBlockingStates_[i]->blocked(&blockFuture)) {
720-
VELOX_CHECK(blockFuture.valid());
721-
futures[i] = std::move(blockFuture);
722-
// This driver is still blocked.
723-
++blockedDrivers;
724-
continue;
725-
}
726712
++runnableDrivers;
727713

728714
ContinueFuture driverFuture = ContinueFuture::makeEmpty();
729-
auto result = driver->next(&driverFuture);
730-
if (result != nullptr) {
731-
VELOX_CHECK(!driverFuture.valid());
715+
auto result = drivers_[i]->next(&driverFuture);
716+
if (result) {
732717
return result;
733718
}
734719

735720
if (driverFuture.valid()) {
736-
driverBlockingStates_[i]->setDriverFuture(driverFuture);
721+
futures[i] = std::move(driverFuture);
737722
}
738723

739724
if (error()) {
@@ -743,7 +728,7 @@ RowVectorPtr Task::next(ContinueFuture* future) {
743728

744729
if (runnableDrivers == 0) {
745730
if (blockedDrivers > 0) {
746-
if (future == nullptr) {
731+
if (!future) {
747732
VELOX_FAIL(
748733
"Cannot make progress as all remaining drivers are blocked and user are not expected to wait.");
749734
} else {
@@ -753,20 +738,14 @@ RowVectorPtr Task::next(ContinueFuture* future) {
753738
notReadyFutures.emplace_back(std::move(continueFuture));
754739
}
755740
}
756-
*future = folly::collectAny(std::move(notReadyFutures)).unit();
741+
*future = folly::collectAll(std::move(notReadyFutures)).unit();
757742
}
758743
}
759744
return nullptr;
760745
}
761746
}
762747
}
763748

764-
std::shared_ptr<Driver> Task::getDriver(uint32_t driverId) const {
765-
VELOX_CHECK_LT(driverId, drivers_.size());
766-
std::unique_lock<std::timed_mutex> l(mutex_);
767-
return drivers_[driverId];
768-
}
769-
770749
void Task::start(uint32_t maxDrivers, uint32_t concurrentSplitGroups) {
771750
facebook::velox::process::ThreadDebugInfo threadDebugInfo{
772751
queryCtx()->queryId(), taskId_, nullptr};
@@ -1501,7 +1480,7 @@ void Task::noMoreSplits(const core::PlanNodeId& planNodeId) {
15011480
}
15021481

15031482
if (allFinished) {
1504-
terminate(TaskState::kFinished);
1483+
terminate(kFinished);
15051484
}
15061485
}
15071486

@@ -3123,68 +3102,4 @@ void Task::MemoryReclaimer::abort(
31233102
<< "Timeout waiting for task to complete during query memory aborting.";
31243103
}
31253104
}
3126-
3127-
void Task::DriverBlockingState::setDriverFuture(ContinueFuture& driverFuture) {
3128-
VELOX_CHECK(!blocked_);
3129-
{
3130-
std::lock_guard<std::mutex> l(mutex_);
3131-
VELOX_CHECK(promises_.empty());
3132-
VELOX_CHECK_NULL(error_);
3133-
blocked_ = true;
3134-
}
3135-
std::move(driverFuture)
3136-
.via(&folly::InlineExecutor::instance())
3137-
.thenValue(
3138-
[&, driverHolder = driver_->shared_from_this()](auto&& /* unused */) {
3139-
std::vector<std::unique_ptr<ContinuePromise>> promises;
3140-
{
3141-
std::lock_guard<std::mutex> l(mutex_);
3142-
VELOX_CHECK(blocked_);
3143-
VELOX_CHECK_NULL(error_);
3144-
promises = std::move(promises_);
3145-
blocked_ = false;
3146-
}
3147-
for (auto& promise : promises) {
3148-
promise->setValue();
3149-
}
3150-
})
3151-
.thenError(
3152-
folly::tag_t<std::exception>{},
3153-
[&, driverHolder = driver_->shared_from_this()](
3154-
std::exception const& e) {
3155-
std::lock_guard<std::mutex> l(mutex_);
3156-
VELOX_CHECK(blocked_);
3157-
VELOX_CHECK_NULL(error_);
3158-
try {
3159-
VELOX_FAIL(
3160-
"A driver future from task {} was realized with error: {}",
3161-
driver_->task()->taskId(),
3162-
e.what());
3163-
} catch (const VeloxException&) {
3164-
error_ = std::current_exception();
3165-
}
3166-
blocked_ = false;
3167-
});
3168-
}
3169-
3170-
bool Task::DriverBlockingState::blocked(ContinueFuture* future) {
3171-
VELOX_CHECK_NOT_NULL(future);
3172-
std::lock_guard<std::mutex> l(mutex_);
3173-
if (error_ != nullptr) {
3174-
std::rethrow_exception(error_);
3175-
}
3176-
if (!blocked_) {
3177-
VELOX_CHECK(promises_.empty());
3178-
return false;
3179-
}
3180-
auto [blockPromise, blockFuture] =
3181-
makeVeloxContinuePromiseContract(fmt::format(
3182-
"DriverBlockingState {} from task {}",
3183-
driver_->driverCtx()->driverId,
3184-
driver_->task()->taskId()));
3185-
*future = std::move(blockFuture);
3186-
promises_.emplace_back(
3187-
std::make_unique<ContinuePromise>(std::move(blockPromise)));
3188-
return true;
3189-
}
31903105
} // namespace facebook::velox::exec

velox/exec/Task.h

+2-37
Original file line numberDiff line numberDiff line change
@@ -613,13 +613,13 @@ class Task : public std::enable_shared_from_this<Task> {
613613
/// realized when the last thread stops running for 'this'. This is used to
614614
/// mark cancellation by the user.
615615
ContinueFuture requestCancel() {
616-
return terminate(TaskState::kCanceled);
616+
return terminate(kCanceled);
617617
}
618618

619619
/// Like requestCancel but sets end state to kAborted. This is for stopping
620620
/// Tasks due to failures of other parts of the query.
621621
ContinueFuture requestAbort() {
622-
return terminate(TaskState::kAborted);
622+
return terminate(kAborted);
623623
}
624624

625625
void requestYield() {
@@ -996,8 +996,6 @@ class Task : public std::enable_shared_from_this<Task> {
996996
// trace enabled.
997997
void maybeInitTrace();
998998

999-
std::shared_ptr<Driver> getDriver(uint32_t driverId) const;
1000-
1001999
// Universally unique identifier of the task. Used to identify the task when
10021000
// calling TaskListener.
10031001
const std::string uuid_;
@@ -1069,39 +1067,6 @@ class Task : public std::enable_shared_from_this<Task> {
10691067

10701068
std::vector<std::unique_ptr<DriverFactory>> driverFactories_;
10711069
std::vector<std::shared_ptr<Driver>> drivers_;
1072-
1073-
// Tracks the blocking state for each driver under serialized execution mode.
1074-
class DriverBlockingState {
1075-
public:
1076-
explicit DriverBlockingState(const Driver* driver) : driver_(driver) {
1077-
VELOX_CHECK_NOT_NULL(driver_);
1078-
}
1079-
1080-
/// Sets driver future by setting the continuation callback via inline
1081-
/// executor.
1082-
void setDriverFuture(ContinueFuture& diverFuture);
1083-
1084-
/// Indicates if the associated driver is blocked or not. If blocked,
1085-
/// 'future' is set which becomes realized when the driver is unblocked.
1086-
///
1087-
/// NOTE: the function throws if the driver has encountered error.
1088-
bool blocked(ContinueFuture* future);
1089-
1090-
private:
1091-
const Driver* const driver_;
1092-
1093-
mutable std::mutex mutex_;
1094-
// Indicates if the associated driver is blocked or not.
1095-
bool blocked_{false};
1096-
// Sets the driver future error if not null.
1097-
std::exception_ptr error_{nullptr};
1098-
// Promises to fulfill when the driver is unblocked.
1099-
std::vector<std::unique_ptr<ContinuePromise>> promises_;
1100-
};
1101-
1102-
// Tracks the driver blocking state under serialized execution mode.
1103-
std::vector<std::unique_ptr<DriverBlockingState>> driverBlockingStates_;
1104-
11051070
// When Drivers are closed by the Task, there is a chance that race and/or
11061071
// bugs can cause such Drivers to be held forever, in turn holding a pointer
11071072
// to the Task making it a zombie Tasks. This vector is used to keep track of

velox/exec/TaskStructs.h

+1-27
Original file line numberDiff line numberDiff line change
@@ -27,24 +27,8 @@ class MergeSource;
2727
class MergeJoinSource;
2828
struct Split;
2929

30-
#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY
31-
enum TaskState {
32-
kRunning = 0,
33-
kFinished = 1,
34-
kCanceled = 2,
35-
kAborted = 3,
36-
kFailed = 4
37-
};
38-
#else
3930
/// Corresponds to Presto TaskState, needed for reporting query completion.
40-
enum class TaskState : int {
41-
kRunning = 0,
42-
kFinished = 1,
43-
kCanceled = 2,
44-
kAborted = 3,
45-
kFailed = 4
46-
};
47-
#endif
31+
enum TaskState { kRunning, kFinished, kCanceled, kAborted, kFailed };
4832

4933
std::string taskStateString(TaskState state);
5034

@@ -155,13 +139,3 @@ struct SplitGroupState {
155139
};
156140

157141
} // namespace facebook::velox::exec
158-
159-
template <>
160-
struct fmt::formatter<facebook::velox::exec::TaskState>
161-
: formatter<std::string> {
162-
auto format(facebook::velox::exec::TaskState state, format_context& ctx)
163-
const {
164-
return formatter<std::string>::format(
165-
facebook::velox::exec::taskStateString(state), ctx);
166-
}
167-
};

0 commit comments

Comments
 (0)