Skip to content

Commit

Permalink
Bloom filter read refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
nmahadevuni committed Oct 21, 2024
1 parent 3aba4f0 commit 4d198b7
Show file tree
Hide file tree
Showing 12 changed files with 139 additions and 165 deletions.
73 changes: 30 additions & 43 deletions velox/dwio/parquet/common/BloomFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,31 +69,31 @@ class BloomFilter {
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(int32_t value) const = 0;
virtual uint64_t hashInt32(int32_t value) const = 0;

/// Compute hash for 64 bits value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(int64_t value) const = 0;
virtual uint64_t hashInt64(int64_t value) const = 0;

/// Compute hash for float value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(float value) const = 0;
virtual uint64_t hashFloat(float value) const = 0;

/// Compute hash for double value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(double value) const = 0;
virtual uint64_t hashDouble(double value) const = 0;

/// Compute hash for bytearray by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(const ByteArray* value) const = 0;
virtual uint64_t hashByteArray(const ByteArray* value) const = 0;

/// Batch compute hashes for 32 bits values by using its plain encoding
/// result.
Expand All @@ -102,7 +102,7 @@ class BloomFilter {
/// @param numValues the number of values to hash.
/// @param hashes a pointer to the output hash values; its length should be
/// equal to numValues.
virtual void hashes(const int32_t* values, int numValues, uint64_t* hashes)
virtual void hashesInt32(const int32_t* values, int numValues, uint64_t* hashes)
const = 0;

/// Batch compute hashes for 64 bits values by using its plain encoding
Expand All @@ -112,7 +112,7 @@ class BloomFilter {
/// @param numValues the number of values to hash.
/// @param hashes a pointer to the output hash values; its length should be
/// equal to numValues.
virtual void hashes(const int64_t* values, int numValues, uint64_t* hashes)
virtual void hashesInt64(const int64_t* values, int numValues, uint64_t* hashes)
const = 0;

/// Batch compute hashes for float values by using its plain encoding result.
Expand All @@ -121,7 +121,7 @@ class BloomFilter {
/// @param numValues the number of values to hash.
/// @param hashes a pointer to the output hash values; its length should be
/// equal to numValues.
virtual void hashes(const float* values, int numValues, uint64_t* hashes)
virtual void hashesFloat(const float* values, int numValues, uint64_t* hashes)
const = 0;

/// Batch compute hashes for double values by using its plain encoding result.
Expand All @@ -130,7 +130,7 @@ class BloomFilter {
/// @param numValues the number of values to hash.
/// @param hashes a pointer to the output hash values; its length should be
/// equal to numValues.
virtual void hashes(const double* values, int numValues, uint64_t* hashes)
virtual void hashesDouble(const double* values, int numValues, uint64_t* hashes)
const = 0;

/// Batch compute hashes for bytearray values by using its plain encoding
Expand All @@ -140,7 +140,7 @@ class BloomFilter {
/// @param numValues the number of values to hash.
/// @param hashes a pointer to the output hash values; its length should be
/// equal to numValues.
virtual void hashes(const ByteArray* values, int numValues, uint64_t* hashes)
virtual void hashesByteArray(const ByteArray* values, int numValues, uint64_t* hashes)
const = 0;

virtual ~BloomFilter() = default;
Expand Down Expand Up @@ -249,54 +249,41 @@ class BlockSplitBloomFilter : public BloomFilter {
return numBytes_;
}

uint64_t hash(int32_t value) const override {
return hasher_->hash(value);
uint64_t hashInt32(int32_t value) const override {
return hasher_->hashInt32(value);
}
uint64_t hash(int64_t value) const override {
return hasher_->hash(value);
uint64_t hashInt64(int64_t value) const override {
return hasher_->hashInt64(value);
}
uint64_t hash(float value) const override {
return hasher_->hash(value);
uint64_t hashFloat(float value) const override {
return hasher_->hashFloat(value);
}
uint64_t hash(double value) const override {
return hasher_->hash(value);
uint64_t hashDouble(double value) const override {
return hasher_->hashDouble(value);
}
uint64_t hash(const ByteArray* value) const override {
return hasher_->hash(value);
uint64_t hashByteArray(const ByteArray* value) const override {
return hasher_->hashByteArray(value);
}

void hashes(const int32_t* values, int numValues, uint64_t* hashes)
void hashesInt32(const int32_t* values, int numValues, uint64_t* hashes)
const override {
hasher_->hashes(values, numValues, hashes);
hasher_->hashesInt32(values, numValues, hashes);
}
void hashes(const int64_t* values, int numValues, uint64_t* hashes)
void hashesInt64(const int64_t* values, int numValues, uint64_t* hashes)
const override {
hasher_->hashes(values, numValues, hashes);
hasher_->hashesInt64(values, numValues, hashes);
}
void hashes(const float* values, int numValues, uint64_t* hashes)
void hashesFloat(const float* values, int numValues, uint64_t* hashes)
const override {
hasher_->hashes(values, numValues, hashes);
hasher_->hashesFloat(values, numValues, hashes);
}
void hashes(const double* values, int numValues, uint64_t* hashes)
void hashesDouble(const double* values, int numValues, uint64_t* hashes)
const override {
hasher_->hashes(values, numValues, hashes);
hasher_->hashesDouble(values, numValues, hashes);
}
void hashes(const ByteArray* values, int numValues, uint64_t* hashes)
void hashesByteArray(const ByteArray* values, int numValues, uint64_t* hashes)
const override {
hasher_->hashes(values, numValues, hashes);
}

uint64_t hash(const int32_t* value) const {
return hasher_->hash(*value);
}
uint64_t hash(const int64_t* value) const {
return hasher_->hash(*value);
}
uint64_t hash(const float* value) const {
return hasher_->hash(*value);
}
uint64_t hash(const double* value) const {
return hasher_->hash(*value);
hasher_->hashesByteArray(values, numValues, hashes);
}

/// Deserialize the Bloom filter from an input stream. It is used when
Expand Down
20 changes: 10 additions & 10 deletions velox/dwio/parquet/common/Hasher.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,31 +47,31 @@ class Hasher {
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(int32_t value) const = 0;
virtual uint64_t hashInt32(int32_t value) const = 0;

/// Compute hash for 64 bits value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(int64_t value) const = 0;
virtual uint64_t hashInt64(int64_t value) const = 0;

/// Compute hash for float value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(float value) const = 0;
virtual uint64_t hashFloat(float value) const = 0;

/// Compute hash for double value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(double value) const = 0;
virtual uint64_t hashDouble(double value) const = 0;

/// Compute hash for ByteArray value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t hash(const ByteArray* value) const = 0;
virtual uint64_t hashByteArray(const ByteArray* value) const = 0;

/// Batch compute hashes for 32 bits values by using its plain encoding
/// result.
Expand All @@ -80,7 +80,7 @@ class Hasher {
/// @param num_values the number of values to hash.
/// @param hashes a pointer to the output hash values, its length should be
/// equal to num_values.
virtual void hashes(const int32_t* values, int num_values, uint64_t* hashes)
virtual void hashesInt32(const int32_t* values, int num_values, uint64_t* hashes)
const = 0;

/// Batch compute hashes for 64 bits values by using its plain encoding
Expand All @@ -90,7 +90,7 @@ class Hasher {
/// @param num_values the number of values to hash.
/// @param hashes a pointer to the output hash values, its length should be
/// equal to num_values.
virtual void hashes(const int64_t* values, int num_values, uint64_t* hashes)
virtual void hashesInt64(const int64_t* values, int num_values, uint64_t* hashes)
const = 0;

/// Batch compute hashes for float values by using its plain encoding result.
Expand All @@ -99,7 +99,7 @@ class Hasher {
/// @param num_values the number of values to hash.
/// @param hashes a pointer to the output hash values, its length should be
/// equal to num_values.
virtual void hashes(const float* values, int num_values, uint64_t* hashes)
virtual void hashesFloat(const float* values, int num_values, uint64_t* hashes)
const = 0;

/// Batch compute hashes for double values by using its plain encoding result.
Expand All @@ -108,7 +108,7 @@ class Hasher {
/// @param num_values the number of values to hash.
/// @param hashes a pointer to the output hash values, its length should be
/// equal to num_values.
virtual void hashes(const double* values, int num_values, uint64_t* hashes)
virtual void hashesDouble(const double* values, int num_values, uint64_t* hashes)
const = 0;

/// Batch compute hashes for ByteArray values by using its plain encoding
Expand All @@ -118,7 +118,7 @@ class Hasher {
/// @param num_values the number of values to hash.
/// @param hashes a pointer to the output hash values, its length should be
/// equal to num_values.
virtual void hashes(const ByteArray* values, int num_values, uint64_t* hashes)
virtual void hashesByteArray(const ByteArray* values, int num_values, uint64_t* hashes)
const = 0;

virtual ~Hasher() = default;
Expand Down
12 changes: 6 additions & 6 deletions velox/dwio/parquet/common/ParquetBloomFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@ class ParquetBloomFilter final : public common::AbstractBloomFilter {
std::shared_ptr<facebook::velox::parquet::BloomFilter> bloomFilter)
: bloomFilter_(bloomFilter) {}

bool mightContain(int32_t value) const override {
return bloomFilter_->findHash(bloomFilter_->hash(value));
bool mightContainInt32(int32_t value) const override {
return bloomFilter_->findHash(bloomFilter_->hashInt32(value));
}

bool mightContain(int64_t value) const override {
return bloomFilter_->findHash(bloomFilter_->hash(value));
bool mightContainInt64(int64_t value) const override {
return bloomFilter_->findHash(bloomFilter_->hashInt64(value));
}

bool mightContain(const std::string& value) const override {
bool mightContainString(const std::string& value) const override {
ByteArray byteArray{value};
return bloomFilter_->findHash(bloomFilter_->hash(&byteArray));
return bloomFilter_->findHash(bloomFilter_->hashByteArray(&byteArray));
}

private:
Expand Down
20 changes: 10 additions & 10 deletions velox/dwio/parquet/common/XxHasher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,50 +42,50 @@ void XxHashesHelper(

} // namespace

uint64_t XxHasher::hash(int32_t value) const {
uint64_t XxHasher::hashInt32(int32_t value) const {
return XxHashHelper(value, kParquetBloomXxHashSeed);
}

uint64_t XxHasher::hash(int64_t value) const {
uint64_t XxHasher::hashInt64(int64_t value) const {
return XxHashHelper(value, kParquetBloomXxHashSeed);
}

uint64_t XxHasher::hash(float value) const {
uint64_t XxHasher::hashFloat(float value) const {
return XxHashHelper(value, kParquetBloomXxHashSeed);
}

uint64_t XxHasher::hash(double value) const {
uint64_t XxHasher::hashDouble(double value) const {
return XxHashHelper(value, kParquetBloomXxHashSeed);
}

uint64_t XxHasher::hash(const ByteArray* value) const {
uint64_t XxHasher::hashByteArray(const ByteArray* value) const {
return XXH64(
reinterpret_cast<const void*>(value->ptr),
value->len,
kParquetBloomXxHashSeed);
}

void XxHasher::hashes(const int32_t* values, int numValues, uint64_t* hashes)
void XxHasher::hashesInt32(const int32_t* values, int numValues, uint64_t* hashes)
const {
XxHashesHelper(values, kParquetBloomXxHashSeed, numValues, hashes);
}

void XxHasher::hashes(const int64_t* values, int numValues, uint64_t* hashes)
void XxHasher::hashesInt64(const int64_t* values, int numValues, uint64_t* hashes)
const {
XxHashesHelper(values, kParquetBloomXxHashSeed, numValues, hashes);
}

void XxHasher::hashes(const float* values, int numValues, uint64_t* hashes)
void XxHasher::hashesFloat(const float* values, int numValues, uint64_t* hashes)
const {
XxHashesHelper(values, kParquetBloomXxHashSeed, numValues, hashes);
}

void XxHasher::hashes(const double* values, int numValues, uint64_t* hashes)
void XxHasher::hashesDouble(const double* values, int numValues, uint64_t* hashes)
const {
XxHashesHelper(values, kParquetBloomXxHashSeed, numValues, hashes);
}

void XxHasher::hashes(const ByteArray* values, int numValues, uint64_t* hashes)
void XxHasher::hashesByteArray(const ByteArray* values, int numValues, uint64_t* hashes)
const {
for (int i = 0; i < numValues; ++i) {
hashes[i] = XXH64(
Expand Down
20 changes: 10 additions & 10 deletions velox/dwio/parquet/common/XxHasher.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,21 @@ namespace facebook::velox::parquet {

class XxHasher : public Hasher {
public:
uint64_t hash(int32_t value) const override;
uint64_t hash(int64_t value) const override;
uint64_t hash(float value) const override;
uint64_t hash(double value) const override;
uint64_t hash(const ByteArray* value) const override;
uint64_t hashInt32(int32_t value) const override;
uint64_t hashInt64(int64_t value) const override;
uint64_t hashFloat(float value) const override;
uint64_t hashDouble(double value) const override;
uint64_t hashByteArray(const ByteArray* value) const override;

void hashes(const int32_t* values, int numValues, uint64_t* hashes)
void hashesInt32(const int32_t* values, int numValues, uint64_t* hashes)
const override;
void hashes(const int64_t* values, int numValues, uint64_t* hashes)
void hashesInt64(const int64_t* values, int numValues, uint64_t* hashes)
const override;
void hashes(const float* values, int numValues, uint64_t* hashes)
void hashesFloat(const float* values, int numValues, uint64_t* hashes)
const override;
void hashes(const double* values, int numValues, uint64_t* hashes)
void hashesDouble(const double* values, int numValues, uint64_t* hashes)
const override;
virtual void hashes(const ByteArray* values, int numValues, uint64_t* hashes)
virtual void hashesByteArray(const ByteArray* values, int numValues, uint64_t* hashes)
const override;

static constexpr int kParquetBloomXxHashSeed = 0;
Expand Down
32 changes: 32 additions & 0 deletions velox/dwio/parquet/reader/ParquetData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,38 @@ std::pair<int64_t, int64_t> ParquetData::getRowGroupRegion(
return {fileOffset, length};
}

/// Opens the input stream used to read this column's bloom filter and stores
/// it in bloomFilterInputStream_.
///
/// The call is a no-op when a stream has already been stored, or when the
/// column chunk of the given row group reports no bloom filter offset.
/// Otherwise the stream covers the byte range from the bloom filter offset
/// through the end of the file.
///
/// Fails a VELOX_CHECK when the column chunk carries crypto metadata, or when
/// the bloom filter offset is not strictly smaller than the file length.
///
/// @param rowGroupId index of the row group whose column chunk metadata is
/// consulted for the bloom filter offset.
/// @param bufferedInput source that supplies the file length and performs the
/// read.
void ParquetData::setBloomFilterInputStream(
    uint32_t rowGroupId,
    dwio::common::BufferedInput& bufferedInput) {
  // A stream that is already present is kept as-is. Note that this guard does
  // not look at rowGroupId.
  if (bloomFilterInputStream_ != nullptr) {
    return;
  }

  auto rowGroupMeta = fileMetaDataPtr_.rowGroup(rowGroupId);
  auto chunkMeta = rowGroupMeta.columnChunk(type_->column());

  // Nothing to open when the chunk metadata has no bloom filter offset.
  if (!chunkMeta.hasBloomFilterOffset()) {
    return;
  }

  VELOX_CHECK(
      !chunkMeta.hasCryptoMetadata(),
      "Cannot read encrypted bloom filter yet");

  auto filterStart = chunkMeta.bloomFilterOffset();
  auto totalLength = bufferedInput.getInputStream()->getLength();

  // The offset has to fall strictly inside the file.
  VELOX_CHECK_GT(
      totalLength,
      filterStart,
      "file size {} less or equal than bloom offset {}",
      totalLength,
      filterStart);

  // Read everything from the filter offset up to the end of the file.
  auto bytesToEnd = totalLength - filterStart;
  bloomFilterInputStream_ = bufferedInput.read(
      filterStart, bytesToEnd, dwio::common::LogType::FOOTER);
}

std::shared_ptr<BloomFilter> ParquetData::getBloomFilter(
const uint32_t rowGroupId) {
auto columnBloomFilterIter = columnBloomFilterMap_.find(rowGroupId);
Expand Down
Loading

0 comments on commit 4d198b7

Please sign in to comment.