Skip to content

Commit 8c8e73d

Browse files
Yuhta and facebook-github-bot
authored and committed
Optimize readWithVisitor for RleEncoding and NullableEncoding (facebookincubator#58)
Summary: X-link: facebookincubator/velox#9896 Pull Request resolved: facebookincubator#58 - Add fast path for `RleEncoding::readWithVisitor` - Use `materializeBoolsAsBits` in `NullableEncoding::readWithVisitor` - Merge `ChunkedBoolsDecoder` with `ChunkedDecoder` - Optimize the data type dispatch in `EncodingUtils.h` to improve compilation time bypass-github-export-checks Reviewed By: oerling Differential Revision: D57675525 fbshipit-source-id: 419c24d81007b22d92a555e1648ac97077aaeecb
1 parent d924245 commit 8c8e73d

File tree

5 files changed

+384
-159
lines changed

5 files changed

+384
-159
lines changed

dwio/nimble/encodings/DictionaryEncoding.h

+27-14
Original file line numberDiff line numberDiff line change
@@ -126,34 +126,40 @@ void DictionaryEncoding<T>::materialize(uint32_t rowCount, void* buffer) {
126126

127127
namespace detail {
128128

129-
class ExtractDictionaryIndices {
129+
class DictionaryIndicesHook : public velox::ValueHook {
130130
public:
131131
static constexpr bool kSkipNulls = true;
132-
using HookType = velox::dwio::common::NoHook;
133132

134-
ExtractDictionaryIndices(uint32_t* indices, velox::vector_size_t offset)
133+
DictionaryIndicesHook(uint32_t* indices, vector_size_t offset)
135134
: indices_(indices), offset_(offset) {}
136135

137-
bool acceptsNulls() const {
136+
bool acceptsNulls() const final {
138137
return false;
139138
}
140139

141-
void addValue(velox::vector_size_t i, uint32_t value) {
142-
indices_[i - offset_] = value;
140+
void addValue(vector_size_t i, const void* value) final {
141+
indices_[i - offset_] = *reinterpret_cast<const uint32_t*>(value);
143142
}
144143

145-
template <typename T>
146-
void addNull(velox::vector_size_t /*i*/) {
147-
NIMBLE_UNREACHABLE(__PRETTY_FUNCTION__);
144+
void addValues(
145+
const vector_size_t* rows,
146+
const void* values,
147+
vector_size_t size,
148+
uint8_t valueWidth) final {
149+
NIMBLE_DASSERT(valueWidth == sizeof(uint32_t), "");
150+
auto* indices = reinterpret_cast<const uint32_t*>(values);
151+
for (vector_size_t i = 0; i < size; ++i) {
152+
indices_[rows[i] - offset_] = indices[i];
153+
}
148154
}
149155

150-
HookType& hook() {
151-
return velox::dwio::common::noHook();
156+
void addNull(vector_size_t /*i*/) final {
157+
NIMBLE_UNREACHABLE(__PRETTY_FUNCTION__);
152158
}
153159

154160
private:
155161
uint32_t* const indices_;
156-
const velox::vector_size_t offset_;
162+
const vector_size_t offset_;
157163
};
158164

159165
} // namespace detail
@@ -163,18 +169,25 @@ template <typename V>
163169
void DictionaryEncoding<T>::readWithVisitor(
164170
V& visitor,
165171
ReadWithVisitorParams& params) {
172+
if constexpr (sizeof(T) < sizeof(uint32_t)) {
173+
// Column reader values buffer is not large enough to hold indices in this
174+
// case.
175+
NIMBLE_UNREACHABLE(typeid(T).name());
176+
}
166177
const auto startRowIndex = visitor.rowIndex();
167178
buffer_.resize(visitor.numRows() - startRowIndex);
168179
velox::common::AlwaysTrue indicesFilter;
180+
detail::DictionaryIndicesHook indicesHook(buffer_.data(), startRowIndex);
169181
auto indicesVisitor = DecoderVisitor<
170182
int32_t,
171183
velox::common::AlwaysTrue,
172-
detail::ExtractDictionaryIndices,
184+
velox::dwio::common::ExtractToHook<detail::DictionaryIndicesHook>,
173185
V::dense>(
174186
indicesFilter,
175187
&visitor.reader(),
176188
velox::RowSet(visitor.rows(), visitor.numRows()),
177-
detail::ExtractDictionaryIndices(buffer_.data(), startRowIndex));
189+
velox::dwio::common::ExtractToHook<detail::DictionaryIndicesHook>(
190+
&indicesHook));
178191
indicesVisitor.setRowIndex(startRowIndex);
179192
callReadWithVisitor(*indicesEncoding_, indicesVisitor, params);
180193
this->template readWithVisitorSlow<false>(visitor, params, [&] {

dwio/nimble/encodings/Encoding.h

+116-23
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ template <typename T, typename Filter, typename ExtractValues, bool kIsDense>
6666
using DecoderVisitor =
6767
velox::dwio::common::ColumnVisitor<T, Filter, ExtractValues, kIsDense>;
6868

69+
using vector_size_t = velox::vector_size_t;
70+
6971
// Extra parameters that need to be persisted/used during a single call of
7072
// readWithVisitor at column reader level, which might span multiple calls of
7173
// readWithVisitor (one per chunk) in decoders.
@@ -75,14 +77,18 @@ struct ReadWithVisitorParams {
7577
// across potential multiple chunks.
7678
std::function<uint64_t*()> makeReaderNulls;
7779

80+
// Initialize `SelectiveColumnReader::returnReaderNulls_' field. Needs to be
81+
// called after decoding nulls in `NullableEncoding'.
82+
std::function<void()> initReturnReaderNulls;
83+
7884
// Create the result nulls if they do not already exist. Similar to
7985
// `makeReaderNulls', we create one single buffer for all the results nulls
8086
// across potential multiple chunks during one read.
8187
std::function<void()> prepareResultNulls;
8288

8389
// Number of rows scanned so far. Contains rows scanned in previous chunks
8490
// during this read call as well.
85-
velox::vector_size_t numScanned;
91+
vector_size_t numScanned;
8692
};
8793

8894
class Encoding {
@@ -342,13 +348,13 @@ class BufferedEncoding {
342348
std::array<T, BufferSize> buffer_;
343349
};
344350

345-
template <typename Visitor1, typename Visitor2>
346-
void checkCurrentRowEqual(const Visitor1& v1, const Visitor2& v2) {
347-
if (v1.atEnd()) {
348-
NIMBLE_DASSERT(v2.atEnd(), "");
351+
template <typename T, typename PhysicalType>
352+
T castFromPhysicalType(const PhysicalType& value) {
353+
if constexpr (isFloatingPointType<T>()) {
354+
static_assert(sizeof(T) == sizeof(PhysicalType));
355+
return reinterpret_cast<const T&>(value);
349356
} else {
350-
NIMBLE_DASSERT(!v2.atEnd(), "");
351-
NIMBLE_DASSERT(v1.currentRow() == v2.currentRow(), "");
357+
return value;
352358
}
353359
}
354360

@@ -362,10 +368,7 @@ void readWithVisitorSlow(
362368
constexpr bool kExtractToReader = std::is_same_v<
363369
typename DecoderVisitor::Extract,
364370
velox::dwio::common::ExtractToReader>;
365-
const uint64_t* nulls = nullptr;
366-
if (auto& nullsBuf = visitor.reader().nullsInReadRange()) {
367-
nulls = nullsBuf->template as<uint64_t>();
368-
}
371+
auto* nulls = visitor.reader().rawNullsInReadRange();
369372
if constexpr (kExtractToReader) {
370373
params.prepareResultNulls();
371374
}
@@ -378,32 +381,122 @@ void readWithVisitorSlow(
378381
numNonNulls -=
379382
velox::bits::countNulls(nulls, numScanned, visitor.currentRow());
380383
}
381-
skip(numNonNulls);
384+
if (numNonNulls > 0) {
385+
skip(numNonNulls);
386+
}
382387
numScanned = visitor.currentRow() + 1;
383388
}
384389
if (nulls && velox::bits::isBitNull(nulls, visitor.currentRow())) {
385390
if (!visitor.allowNulls()) {
386-
visitor.setRowIndex(visitor.rowIndex() + 1);
391+
visitor.addRowIndex(1);
387392
atEnd = visitor.atEnd();
388393
} else if (kExtractToReader && visitor.reader().returnReaderNulls()) {
389-
visitor.setRowIndex(visitor.rowIndex() + 1);
390-
visitor.setNumValues(visitor.reader().numValues() + 1);
394+
visitor.addRowIndex(1);
395+
visitor.addNumValues(1);
391396
atEnd = visitor.atEnd();
392397
} else {
393398
visitor.processNull(atEnd);
394399
}
395400
} else {
396-
auto value = decodeOne();
397-
if constexpr (isFloatingPointType<T>()) {
398-
if constexpr (sizeof(T) != sizeof(value)) {
399-
NIMBLE_UNREACHABLE(typeid(decltype(value)).name());
400-
}
401-
visitor.process(reinterpret_cast<const T&>(value), atEnd);
402-
} else {
403-
visitor.process(value, atEnd);
401+
visitor.process(castFromPhysicalType<T>(decodeOne()), atEnd);
402+
}
403+
}
404+
}
405+
406+
template <typename TEncoding, typename V>
407+
void readWithVisitorFast(
408+
TEncoding& encoding,
409+
V& visitor,
410+
ReadWithVisitorParams& params,
411+
const uint64_t* nulls) {
412+
constexpr bool kOutputNulls = !V::kHasFilter && !V::kHasHook;
413+
const auto numRows = visitor.numRows() - visitor.rowIndex();
414+
auto& outerRows = visitor.outerNonNullRows();
415+
if (!nulls) {
416+
encoding.template bulkScan<false>(
417+
visitor,
418+
params.numScanned,
419+
visitor.rows() + visitor.rowIndex(),
420+
numRows,
421+
velox::iota(visitor.numRows(), outerRows) + visitor.rowIndex());
422+
return;
423+
}
424+
// TODO: Store last non null index and num non-nulls so far in decoder to
425+
// accelerate multi-chunk decoding.
426+
const auto numNonNullsSoFar =
427+
velox::bits::countNonNulls(nulls, 0, params.numScanned);
428+
if constexpr (V::dense) {
429+
NIMBLE_DASSERT(
430+
!visitor.reader().hasNulls() || visitor.reader().returnReaderNulls(),
431+
"");
432+
outerRows.resize(numRows);
433+
auto numNonNulls = velox::simd::indicesOfSetBits(
434+
nulls, visitor.rowIndex(), visitor.numRows(), outerRows.data());
435+
outerRows.resize(numNonNulls);
436+
if (outerRows.empty()) {
437+
if constexpr (kOutputNulls) {
438+
visitor.addNumValues(numRows);
404439
}
440+
visitor.addRowIndex(numRows);
441+
} else {
442+
encoding.template bulkScan<true>(
443+
visitor,
444+
numNonNullsSoFar,
445+
visitor.rows() + numNonNullsSoFar,
446+
numNonNulls,
447+
outerRows.data());
448+
}
449+
return;
450+
}
451+
auto& innerRows = visitor.innerNonNullRows();
452+
int32_t tailSkip = -1;
453+
uint64_t* resultNulls = nullptr;
454+
uint8_t* chunkResultNulls = nullptr;
455+
if constexpr (kOutputNulls) {
456+
params.prepareResultNulls();
457+
resultNulls = visitor.reader().rawResultNulls();
458+
chunkResultNulls = reinterpret_cast<uint8_t*>(resultNulls) +
459+
velox::bits::nbytes(visitor.rowIndex());
460+
}
461+
bool anyNulls =
462+
velox::dwio::common::nonNullRowsFromSparse<V::kHasFilter, kOutputNulls>(
463+
nulls,
464+
velox::RowSet(visitor.rows() + visitor.rowIndex(), numRows),
465+
innerRows,
466+
outerRows,
467+
chunkResultNulls,
468+
tailSkip);
469+
if (anyNulls) {
470+
visitor.setHasNulls();
471+
}
472+
if (kOutputNulls && visitor.rowIndex() % 8 != 0) {
473+
velox::bits::copyBits(
474+
resultNulls,
475+
velox::bits::roundUp(visitor.rowIndex(), 8),
476+
resultNulls,
477+
visitor.rowIndex(),
478+
numRows);
479+
}
480+
if (!V::kHasFilter && visitor.rowIndex() > 0) {
481+
for (auto& row : outerRows) {
482+
row += visitor.rowIndex();
405483
}
406484
}
485+
if (innerRows.empty()) {
486+
if constexpr (kOutputNulls) {
487+
visitor.addNumValues(numRows);
488+
}
489+
visitor.addRowIndex(numRows);
490+
encoding.skip(tailSkip - numNonNullsSoFar);
491+
} else {
492+
encoding.template bulkScan<true>(
493+
visitor,
494+
numNonNullsSoFar,
495+
innerRows.data(),
496+
innerRows.size(),
497+
outerRows.data());
498+
encoding.skip(tailSkip);
499+
}
407500
}
408501

409502
} // namespace detail

0 commit comments

Comments
 (0)