Skip to content

Commit d6f09be

Browse files
committed
Bloom filter read refactor
1 parent 0107329 commit d6f09be

12 files changed

+186
-184
lines changed

velox/dwio/parquet/common/BloomFilter.h

+36-47
Original file line numberDiff line numberDiff line change
@@ -69,31 +69,31 @@ class BloomFilter {
6969
///
7070
/// @param value the value to hash.
7171
/// @return hash result.
72-
virtual uint64_t hash(int32_t value) const = 0;
72+
virtual uint64_t hashInt32(int32_t value) const = 0;
7373

7474
/// Compute hash for 64 bits value by using its plain encoding result.
7575
///
7676
/// @param value the value to hash.
7777
/// @return hash result.
78-
virtual uint64_t hash(int64_t value) const = 0;
78+
virtual uint64_t hashInt64(int64_t value) const = 0;
7979

8080
/// Compute hash for float value by using its plain encoding result.
8181
///
8282
/// @param value the value to hash.
8383
/// @return hash result.
84-
virtual uint64_t hash(float value) const = 0;
84+
virtual uint64_t hashFloat(float value) const = 0;
8585

8686
/// Compute hash for double value by using its plain encoding result.
8787
///
8888
/// @param value the value to hash.
8989
/// @return hash result.
90-
virtual uint64_t hash(double value) const = 0;
90+
virtual uint64_t hashDouble(double value) const = 0;
9191

9292
/// Compute hash for bytearray by using its plain encoding result.
9393
///
9494
/// @param value the value to hash.
9595
/// @return hash result.
96-
virtual uint64_t hash(const ByteArray* value) const = 0;
96+
virtual uint64_t hashByteArray(const ByteArray* value) const = 0;
9797

9898
/// Batch compute hashes for 32 bits values by using its plain encoding
9999
/// result.
@@ -102,8 +102,8 @@ class BloomFilter {
102102
/// @param numValues the number of values to hash.
103103
/// @param hashes a pointer to the output hash values, its length should be
104104
/// equal to numValues.
105-
virtual void hashes(const int32_t* values, int numValues, uint64_t* hashes)
106-
const = 0;
105+
virtual void
106+
hashesInt32(const int32_t* values, int numValues, uint64_t* hashes) const = 0;
107107

108108
/// Batch compute hashes for 64 bits values by using its plain encoding
109109
/// result.
@@ -112,16 +112,16 @@ class BloomFilter {
112112
/// @param numValues the number of values to hash.
113113
/// @param hashes a pointer to the output hash values, its length should be
114114
/// equal to numValues.
115-
virtual void hashes(const int64_t* values, int numValues, uint64_t* hashes)
116-
const = 0;
115+
virtual void
116+
hashesInt64(const int64_t* values, int numValues, uint64_t* hashes) const = 0;
117117

118118
/// Batch compute hashes for float values by using its plain encoding result.
119119
///
120120
/// @param values a pointer to the values to hash.
121121
/// @param numValues the number of values to hash.
122122
/// @param hashes a pointer to the output hash values, its length should be
123123
/// equal to numValues.
124-
virtual void hashes(const float* values, int numValues, uint64_t* hashes)
124+
virtual void hashesFloat(const float* values, int numValues, uint64_t* hashes)
125125
const = 0;
126126

127127
/// Batch compute hashes for double values by using its plain encoding result.
@@ -130,8 +130,8 @@ class BloomFilter {
130130
/// @param numValues the number of values to hash.
131131
/// @param hashes a pointer to the output hash values, its length should be
132132
/// equal to numValues.
133-
virtual void hashes(const double* values, int numValues, uint64_t* hashes)
134-
const = 0;
133+
virtual void
134+
hashesDouble(const double* values, int numValues, uint64_t* hashes) const = 0;
135135

136136
/// Batch compute hashes for bytearray values by using its plain encoding
137137
/// result.
@@ -140,8 +140,10 @@ class BloomFilter {
140140
/// @param numValues the number of values to hash.
141141
/// @param hashes a pointer to the output hash values, its length should be
142142
/// equal to numValues.
143-
virtual void hashes(const ByteArray* values, int numValues, uint64_t* hashes)
144-
const = 0;
143+
virtual void hashesByteArray(
144+
const ByteArray* values,
145+
int numValues,
146+
uint64_t* hashes) const = 0;
145147

146148
virtual ~BloomFilter() = default;
147149

@@ -249,54 +251,41 @@ class BlockSplitBloomFilter : public BloomFilter {
249251
return numBytes_;
250252
}
251253

252-
uint64_t hash(int32_t value) const override {
253-
return hasher_->hash(value);
254+
uint64_t hashInt32(int32_t value) const override {
255+
return hasher_->hashInt32(value);
254256
}
255-
uint64_t hash(int64_t value) const override {
256-
return hasher_->hash(value);
257+
uint64_t hashInt64(int64_t value) const override {
258+
return hasher_->hashInt64(value);
257259
}
258-
uint64_t hash(float value) const override {
259-
return hasher_->hash(value);
260+
uint64_t hashFloat(float value) const override {
261+
return hasher_->hashFloat(value);
260262
}
261-
uint64_t hash(double value) const override {
262-
return hasher_->hash(value);
263+
uint64_t hashDouble(double value) const override {
264+
return hasher_->hashDouble(value);
263265
}
264-
uint64_t hash(const ByteArray* value) const override {
265-
return hasher_->hash(value);
266+
uint64_t hashByteArray(const ByteArray* value) const override {
267+
return hasher_->hashByteArray(value);
266268
}
267269

268-
void hashes(const int32_t* values, int numValues, uint64_t* hashes)
270+
void hashesInt32(const int32_t* values, int numValues, uint64_t* hashes)
269271
const override {
270-
hasher_->hashes(values, numValues, hashes);
272+
hasher_->hashesInt32(values, numValues, hashes);
271273
}
272-
void hashes(const int64_t* values, int numValues, uint64_t* hashes)
274+
void hashesInt64(const int64_t* values, int numValues, uint64_t* hashes)
273275
const override {
274-
hasher_->hashes(values, numValues, hashes);
276+
hasher_->hashesInt64(values, numValues, hashes);
275277
}
276-
void hashes(const float* values, int numValues, uint64_t* hashes)
278+
void hashesFloat(const float* values, int numValues, uint64_t* hashes)
277279
const override {
278-
hasher_->hashes(values, numValues, hashes);
280+
hasher_->hashesFloat(values, numValues, hashes);
279281
}
280-
void hashes(const double* values, int numValues, uint64_t* hashes)
282+
void hashesDouble(const double* values, int numValues, uint64_t* hashes)
281283
const override {
282-
hasher_->hashes(values, numValues, hashes);
284+
hasher_->hashesDouble(values, numValues, hashes);
283285
}
284-
void hashes(const ByteArray* values, int numValues, uint64_t* hashes)
286+
void hashesByteArray(const ByteArray* values, int numValues, uint64_t* hashes)
285287
const override {
286-
hasher_->hashes(values, numValues, hashes);
287-
}
288-
289-
uint64_t hash(const int32_t* value) const {
290-
return hasher_->hash(*value);
291-
}
292-
uint64_t hash(const int64_t* value) const {
293-
return hasher_->hash(*value);
294-
}
295-
uint64_t hash(const float* value) const {
296-
return hasher_->hash(*value);
297-
}
298-
uint64_t hash(const double* value) const {
299-
return hasher_->hash(*value);
288+
hasher_->hashesByteArray(values, numValues, hashes);
300289
}
301290

302291
/// Deserialize the Bloom filter from an input stream. It is used when

velox/dwio/parquet/common/Hasher.h

+23-15
Original file line numberDiff line numberDiff line change
@@ -47,31 +47,31 @@ class Hasher {
4747
///
4848
/// @param value the value to hash.
4949
/// @return hash result.
50-
virtual uint64_t hash(int32_t value) const = 0;
50+
virtual uint64_t hashInt32(int32_t value) const = 0;
5151

5252
/// Compute hash for 64 bits value by using its plain encoding result.
5353
///
5454
/// @param value the value to hash.
5555
/// @return hash result.
56-
virtual uint64_t hash(int64_t value) const = 0;
56+
virtual uint64_t hashInt64(int64_t value) const = 0;
5757

5858
/// Compute hash for float value by using its plain encoding result.
5959
///
6060
/// @param value the value to hash.
6161
/// @return hash result.
62-
virtual uint64_t hash(float value) const = 0;
62+
virtual uint64_t hashFloat(float value) const = 0;
6363

6464
/// Compute hash for double value by using its plain encoding result.
6565
///
6666
/// @param value the value to hash.
6767
/// @return hash result.
68-
virtual uint64_t hash(double value) const = 0;
68+
virtual uint64_t hashDouble(double value) const = 0;
6969

7070
/// Compute hash for ByteArray value by using its plain encoding result.
7171
///
7272
/// @param value the value to hash.
7373
/// @return hash result.
74-
virtual uint64_t hash(const ByteArray* value) const = 0;
74+
virtual uint64_t hashByteArray(const ByteArray* value) const = 0;
7575

7676
/// Batch compute hashes for 32 bits values by using its plain encoding
7777
/// result.
@@ -80,8 +80,10 @@ class Hasher {
8080
/// @param num_values the number of values to hash.
8181
/// @param hashes a pointer to the output hash values, its length should be
8282
/// equal to num_values.
83-
virtual void hashes(const int32_t* values, int num_values, uint64_t* hashes)
84-
const = 0;
83+
virtual void hashesInt32(
84+
const int32_t* values,
85+
int num_values,
86+
uint64_t* hashes) const = 0;
8587

8688
/// Batch compute hashes for 64 bits values by using its plain encoding
8789
/// result.
@@ -90,26 +92,30 @@ class Hasher {
9092
/// @param num_values the number of values to hash.
9193
/// @param hashes a pointer to the output hash values, its length should be
9294
/// equal to num_values.
93-
virtual void hashes(const int64_t* values, int num_values, uint64_t* hashes)
94-
const = 0;
95+
virtual void hashesInt64(
96+
const int64_t* values,
97+
int num_values,
98+
uint64_t* hashes) const = 0;
9599

96100
/// Batch compute hashes for float values by using its plain encoding result.
97101
///
98102
/// @param values a pointer to the values to hash.
99103
/// @param num_values the number of values to hash.
100104
/// @param hashes a pointer to the output hash values, its length should be
101105
/// equal to num_values.
102-
virtual void hashes(const float* values, int num_values, uint64_t* hashes)
103-
const = 0;
106+
virtual void
107+
hashesFloat(const float* values, int num_values, uint64_t* hashes) const = 0;
104108

105109
/// Batch compute hashes for double values by using its plain encoding result.
106110
///
107111
/// @param values a pointer to the values to hash.
108112
/// @param num_values the number of values to hash.
109113
/// @param hashes a pointer to the output hash values, its length should be
110114
/// equal to num_values.
111-
virtual void hashes(const double* values, int num_values, uint64_t* hashes)
112-
const = 0;
115+
virtual void hashesDouble(
116+
const double* values,
117+
int num_values,
118+
uint64_t* hashes) const = 0;
113119

114120
/// Batch compute hashes for ByteArray values by using its plain encoding
115121
/// result.
@@ -118,8 +124,10 @@ class Hasher {
118124
/// @param num_values the number of values to hash.
119125
/// @param hashes a pointer to the output hash values, its length should be
120126
/// equal to num_values.
121-
virtual void hashes(const ByteArray* values, int num_values, uint64_t* hashes)
122-
const = 0;
127+
virtual void hashesByteArray(
128+
const ByteArray* values,
129+
int num_values,
130+
uint64_t* hashes) const = 0;
123131

124132
virtual ~Hasher() = default;
125133
};

velox/dwio/parquet/common/ParquetBloomFilter.h

+6-6
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,17 @@ class ParquetBloomFilter final : public common::AbstractBloomFilter {
2727
std::shared_ptr<facebook::velox::parquet::BloomFilter> bloomFilter)
2828
: bloomFilter_(bloomFilter) {}
2929

30-
bool mightContain(int32_t value) const override {
31-
return bloomFilter_->findHash(bloomFilter_->hash(value));
30+
bool mightContainInt32(int32_t value) const override {
31+
return bloomFilter_->findHash(bloomFilter_->hashInt32(value));
3232
}
3333

34-
bool mightContain(int64_t value) const override {
35-
return bloomFilter_->findHash(bloomFilter_->hash(value));
34+
bool mightContainInt64(int64_t value) const override {
35+
return bloomFilter_->findHash(bloomFilter_->hashInt64(value));
3636
}
3737

38-
bool mightContain(const std::string& value) const override {
38+
bool mightContainString(const std::string& value) const override {
3939
ByteArray byteArray{value};
40-
return bloomFilter_->findHash(bloomFilter_->hash(&byteArray));
40+
return bloomFilter_->findHash(bloomFilter_->hashByteArray(&byteArray));
4141
}
4242

4343
private:

velox/dwio/parquet/common/XxHasher.cpp

+22-14
Original file line numberDiff line numberDiff line change
@@ -42,51 +42,59 @@ void XxHashesHelper(
4242

4343
} // namespace
4444

45-
uint64_t XxHasher::hash(int32_t value) const {
45+
uint64_t XxHasher::hashInt32(int32_t value) const {
4646
return XxHashHelper(value, kParquetBloomXxHashSeed);
4747
}
4848

49-
uint64_t XxHasher::hash(int64_t value) const {
49+
uint64_t XxHasher::hashInt64(int64_t value) const {
5050
return XxHashHelper(value, kParquetBloomXxHashSeed);
5151
}
5252

53-
uint64_t XxHasher::hash(float value) const {
53+
uint64_t XxHasher::hashFloat(float value) const {
5454
return XxHashHelper(value, kParquetBloomXxHashSeed);
5555
}
5656

57-
uint64_t XxHasher::hash(double value) const {
57+
uint64_t XxHasher::hashDouble(double value) const {
5858
return XxHashHelper(value, kParquetBloomXxHashSeed);
5959
}
6060

61-
uint64_t XxHasher::hash(const ByteArray* value) const {
61+
uint64_t XxHasher::hashByteArray(const ByteArray* value) const {
6262
return XXH64(
6363
reinterpret_cast<const void*>(value->ptr),
6464
value->len,
6565
kParquetBloomXxHashSeed);
6666
}
6767

68-
void XxHasher::hashes(const int32_t* values, int numValues, uint64_t* hashes)
69-
const {
68+
void XxHasher::hashesInt32(
69+
const int32_t* values,
70+
int numValues,
71+
uint64_t* hashes) const {
7072
XxHashesHelper(values, kParquetBloomXxHashSeed, numValues, hashes);
7173
}
7274

73-
void XxHasher::hashes(const int64_t* values, int numValues, uint64_t* hashes)
74-
const {
75+
void XxHasher::hashesInt64(
76+
const int64_t* values,
77+
int numValues,
78+
uint64_t* hashes) const {
7579
XxHashesHelper(values, kParquetBloomXxHashSeed, numValues, hashes);
7680
}
7781

78-
void XxHasher::hashes(const float* values, int numValues, uint64_t* hashes)
82+
void XxHasher::hashesFloat(const float* values, int numValues, uint64_t* hashes)
7983
const {
8084
XxHashesHelper(values, kParquetBloomXxHashSeed, numValues, hashes);
8185
}
8286

83-
void XxHasher::hashes(const double* values, int numValues, uint64_t* hashes)
84-
const {
87+
void XxHasher::hashesDouble(
88+
const double* values,
89+
int numValues,
90+
uint64_t* hashes) const {
8591
XxHashesHelper(values, kParquetBloomXxHashSeed, numValues, hashes);
8692
}
8793

88-
void XxHasher::hashes(const ByteArray* values, int numValues, uint64_t* hashes)
89-
const {
94+
void XxHasher::hashesByteArray(
95+
const ByteArray* values,
96+
int numValues,
97+
uint64_t* hashes) const {
9098
for (int i = 0; i < numValues; ++i) {
9199
hashes[i] = XXH64(
92100
reinterpret_cast<const void*>(values[i].ptr),

0 commit comments

Comments
 (0)