Skip to content

Commit 4a6f12f

Browse files
committed
fix: positional delete bug when base row number is greater than the largest
delete position
1 parent ebfb1e5 commit 4a6f12f

File tree

2 files changed

+34
-13
lines changed

2 files changed

+34
-13
lines changed

velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,16 @@ void PositionalDeleteFileReader::updateDeleteBitmap(
223223
deletePositionsVector->as<FlatVector<int64_t>>()->rawValues();
224224
int64_t offset = baseReadOffset + splitOffset_;
225225

226+
// If the offset is greater than the last position in this delete rows batch,
227+
// nothing to delete.
228+
if (offset > deletePositions[deletePositionsVector->size() - 1]) {
229+
return;
230+
}
231+
232+
while (deletePositionsOffset_ < deletePositionsVector->size() &&
233+
deletePositions[deletePositionsOffset_] < offset) {
234+
deletePositionsOffset_++;
235+
}
226236
while (deletePositionsOffset_ < deletePositionsVector->size() &&
227237
deletePositions[deletePositionsOffset_] < rowNumberUpperBound) {
228238
bits::setBit(

velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp

+24-13
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,8 @@ class HiveIcebergTest : public HiveConnectorTestBase {
209209
}
210210
}
211211

212-
splits.emplace_back(makeIcebergSplit(baseFilePath, deleteFiles));
212+
auto icebergSplits = makeIcebergSplit(baseFilePath, 3, deleteFiles);
213+
splits.insert(splits.end(), icebergSplits.begin(), icebergSplits.end());
213214
}
214215

215216
std::string duckdbSql =
@@ -335,8 +336,9 @@ class HiveIcebergTest : public HiveConnectorTestBase {
335336
return vectors;
336337
}
337338

338-
std::shared_ptr<ConnectorSplit> makeIcebergSplit(
339+
std::vector<std::shared_ptr<ConnectorSplit>> makeIcebergSplit(
339340
const std::string& dataFilePath,
341+
const uint32_t splitCount = 1,
340342
const std::vector<IcebergDeleteFile>& deleteFiles = {}) {
341343
std::unordered_map<std::string, std::optional<std::string>> partitionKeys;
342344
std::unordered_map<std::string, std::string> customSplitInfo;
@@ -346,17 +348,24 @@ class HiveIcebergTest : public HiveConnectorTestBase {
346348
->openFileForRead(dataFilePath);
347349
const int64_t fileSize = file->size();
348350

349-
return std::make_shared<HiveIcebergSplit>(
350-
kHiveConnectorId,
351-
dataFilePath,
352-
fileFomat_,
353-
0,
354-
fileSize,
355-
partitionKeys,
356-
std::nullopt,
357-
customSplitInfo,
358-
nullptr,
359-
deleteFiles);
351+
std::vector<std::shared_ptr<ConnectorSplit>> splits;
352+
const uint64_t splitSize = std::ceil((fileSize) / splitCount);
353+
354+
for (int i = 0; i < splitCount; ++i) {
355+
splits.emplace_back(std::make_shared<HiveIcebergSplit>(
356+
kHiveConnectorId,
357+
dataFilePath,
358+
fileFomat_,
359+
i * splitSize,
360+
splitSize,
361+
partitionKeys,
362+
std::nullopt,
363+
customSplitInfo,
364+
nullptr,
365+
deleteFiles));
366+
}
367+
368+
return splits;
360369
}
361370

362371
std::string getDuckDBQuery(
@@ -657,6 +666,8 @@ TEST_F(HiveIcebergTest, positionalDeletesMultipleSplits) {
657666
assertMultipleSplits(makeRandomIncreasingValues(0, 20000), 10, 3);
658667
assertMultipleSplits(makeContinuousIncreasingValues(0, 20000), 10, 3);
659668
assertMultipleSplits({}, 10, 3);
669+
670+
assertMultipleSplits(makeContinuousIncreasingValues(0, 5000), 1, 3);
660671
}
661672

662673
} // namespace facebook::velox::connector::hive::iceberg

0 commit comments

Comments
 (0)