Add Parquet bloom filter read support for int, bigint, and string columns

nmahadevuni · Mar 6, 2025 · bea99c9 · bea99c9
1 parent 9a5946a
commit bea99c9
Show file tree

Hide file tree

Showing 24 changed files with 936 additions and 147 deletions.
diff --git a/velox/connectors/hive/HiveConfig.cpp b/velox/connectors/hive/HiveConfig.cpp
@@ -107,6 +107,13 @@ bool HiveConfig::isFileColumnNamesReadAsLowerCase(
       config_->get<bool>(kFileColumnNamesReadAsLowerCase, false));
 }
 
+bool HiveConfig::isParquetReadBloomFilter(
+    const config::ConfigBase* session) const {
+  return session->get<bool>(
+      kParquetReadBloomFilterSession,
+      config_->get<bool>(kParquetReadBloomFilter, false));
+}
+
 bool HiveConfig::isPartitionPathAsLowerCase(
     const config::ConfigBase* session) const {
   return session->get<bool>(kPartitionPathAsLowerCaseSession, true);

diff --git a/velox/connectors/hive/HiveConfig.h b/velox/connectors/hive/HiveConfig.h
@@ -80,6 +80,13 @@ class HiveConfig {
   static constexpr const char* kParquetUseColumnNamesSession =
       "parquet_use_column_names";
 
+  // Read bloom filters from parquet files to filter row groups.
+  static constexpr const char* kParquetReadBloomFilter =
+      "hive.parquet.read-bloom-filter";
+
+  static constexpr const char* kParquetReadBloomFilterSession =
+      "hive_parquet_read_bloom_filter";
+
   /// Reads the source file column name as lower case.
   static constexpr const char* kFileColumnNamesReadAsLowerCase =
       "file-column-names-read-as-lower-case";
@@ -199,6 +206,8 @@ class HiveConfig {
   bool isFileColumnNamesReadAsLowerCase(
       const config::ConfigBase* session) const;
 
+  bool isParquetReadBloomFilter(const config::ConfigBase* session) const;
+
   bool isPartitionPathAsLowerCase(const config::ConfigBase* session) const;
 
   bool allowNullPartitionKeys(const config::ConfigBase* session) const;

diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp
@@ -603,6 +603,11 @@ void configureReaderOptions(
 
     readerOptions.setFileFormat(hiveSplit->fileFormat);
   }
+
+  if (readerOptions.fileFormat() == dwio::common::FileFormat::PARQUET) {
+    readerOptions.setReadBloomFilter(
+        hiveConfig->isParquetReadBloomFilter(sessionProperties));
+  }
 }
 
 void configureRowReaderOptions(

diff --git a/velox/connectors/hive/tests/HiveConfigTest.cpp b/velox/connectors/hive/tests/HiveConfigTest.cpp
@@ -37,7 +37,7 @@ TEST(HiveConfigTest, defaultConfig) {
   ASSERT_EQ(hiveConfig.gcsCredentialsPath(), "");
   ASSERT_FALSE(hiveConfig.isOrcUseColumnNames(emptySession.get()));
   ASSERT_FALSE(hiveConfig.isFileColumnNamesReadAsLowerCase(emptySession.get()));
-
+  ASSERT_FALSE(hiveConfig.isParquetReadBloomFilter(emptySession.get()));
   ASSERT_EQ(hiveConfig.maxCoalescedBytes(emptySession.get()), 128 << 20);
   ASSERT_EQ(
       hiveConfig.maxCoalescedDistanceBytes(emptySession.get()), 512 << 10);
@@ -64,6 +64,7 @@ TEST(HiveConfigTest, overrideConfig) {
       {HiveConfig::kGcsCredentialsPath, "hey"},
       {HiveConfig::kOrcUseColumnNames, "true"},
       {HiveConfig::kFileColumnNamesReadAsLowerCase, "true"},
+      {HiveConfig::kParquetReadBloomFilter, "true"},
       {HiveConfig::kAllowNullPartitionKeys, "false"},
       {HiveConfig::kMaxCoalescedBytes, "100"},
       {HiveConfig::kMaxCoalescedDistance, "100kB"},
@@ -92,6 +93,7 @@ TEST(HiveConfigTest, overrideConfig) {
   ASSERT_EQ(hiveConfig.maxCoalescedBytes(emptySession.get()), 100);
   ASSERT_EQ(
       hiveConfig.maxCoalescedDistanceBytes(emptySession.get()), 100 << 10);
+  ASSERT_TRUE(hiveConfig.isParquetReadBloomFilter(emptySession.get()));
   ASSERT_EQ(hiveConfig.numCacheFileHandles(), 100);
   ASSERT_FALSE(hiveConfig.isFileHandleCacheEnabled());
   ASSERT_EQ(hiveConfig.sortWriterMaxOutputRows(emptySession.get()), 100);

diff --git a/velox/dwio/common/Options.h b/velox/dwio/common/Options.h
@@ -497,6 +497,11 @@ class ReaderOptions : public io::ReaderOptions {
     return *this;
   }
 
+  ReaderOptions& setReadBloomFilter(bool flag) {
+    readBloomFilter_ = flag;
+    return *this;
+  }
+
   ReaderOptions& setIOExecutor(std::shared_ptr<folly::Executor> executor) {
     ioExecutor_ = std::move(executor);
     return *this;
@@ -567,6 +572,10 @@ class ReaderOptions : public io::ReaderOptions {
     return useColumnNamesForColumnMapping_;
   }
 
+  bool readBloomFilter() const {
+    return readBloomFilter_;
+  }
+
   const std::shared_ptr<random::RandomSkipTracker>& randomSkip() const {
     return randomSkip_;
   }
@@ -609,6 +618,7 @@ class ReaderOptions : public io::ReaderOptions {
   uint64_t filePreloadThreshold_{kDefaultFilePreloadThreshold};
   bool fileColumnNamesReadAsLowerCase_{false};
   bool useColumnNamesForColumnMapping_{false};
+  bool readBloomFilter_{false};
   std::shared_ptr<folly::Executor> ioExecutor_;
   std::shared_ptr<random::RandomSkipTracker> randomSkip_;
   std::shared_ptr<velox::common::ScanSpec> scanSpec_;

diff --git a/velox/dwio/parquet/common/BloomFilter.h b/velox/dwio/parquet/common/BloomFilter.h
@@ -68,31 +68,31 @@ class BloomFilter {
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(int32_t value) const = 0;
+  virtual uint64_t hashInt32(int32_t value) const = 0;
 
   /// Compute hash for 64 bits value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(int64_t value) const = 0;
+  virtual uint64_t hashInt64(int64_t value) const = 0;
 
   /// Compute hash for float value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(float value) const = 0;
+  virtual uint64_t hashFloat(float value) const = 0;
 
   /// Compute hash for double value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(double value) const = 0;
+  virtual uint64_t hashDouble(double value) const = 0;
 
   /// Compute hash for bytearray by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(const ByteArray* value) const = 0;
+  virtual uint64_t hashByteArray(const ByteArray* value) const = 0;
 
   /// Batch compute hashes for 32 bits values by using its plain encoding
   /// result.
@@ -101,8 +101,8 @@ class BloomFilter {
   /// @param numValues the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const int32_t* values, int numValues, uint64_t* hashes)
-      const = 0;
+  virtual void
+  hashesInt32(const int32_t* values, int numValues, uint64_t* hashes) const = 0;
 
   /// Batch compute hashes for 64 bits values by using its plain encoding
   /// result.
@@ -111,16 +111,16 @@ class BloomFilter {
   /// @param numValues the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const int64_t* values, int numValues, uint64_t* hashes)
-      const = 0;
+  virtual void
+  hashesInt64(const int64_t* values, int numValues, uint64_t* hashes) const = 0;
 
   /// Batch compute hashes for float values by using its plain encoding result.
   ///
   /// @param values a pointer to the values to hash.
   /// @param numValues the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const float* values, int numValues, uint64_t* hashes)
+  virtual void hashesFloat(const float* values, int numValues, uint64_t* hashes)
       const = 0;
 
   /// Batch compute hashes for double values by using its plain encoding result.
@@ -129,8 +129,8 @@ class BloomFilter {
   /// @param numValues the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const double* values, int numValues, uint64_t* hashes)
-      const = 0;
+  virtual void
+  hashesDouble(const double* values, int numValues, uint64_t* hashes) const = 0;
 
   /// Batch compute hashes for bytearray values by using its plain encoding
   /// result.
@@ -139,8 +139,10 @@ class BloomFilter {
   /// @param numValues the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const ByteArray* values, int numValues, uint64_t* hashes)
-      const = 0;
+  virtual void hashesByteArray(
+      const ByteArray* values,
+      int numValues,
+      uint64_t* hashes) const = 0;
 
   virtual ~BloomFilter() = default;
 
@@ -248,54 +250,41 @@ class BlockSplitBloomFilter : public BloomFilter {
     return numBytes_;
   }
 
-  uint64_t hash(int32_t value) const override {
-    return hasher_->hash(value);
+  uint64_t hashInt32(int32_t value) const override {
+    return hasher_->hashInt32(value);
   }
-  uint64_t hash(int64_t value) const override {
-    return hasher_->hash(value);
+  uint64_t hashInt64(int64_t value) const override {
+    return hasher_->hashInt64(value);
   }
-  uint64_t hash(float value) const override {
-    return hasher_->hash(value);
+  uint64_t hashFloat(float value) const override {
+    return hasher_->hashFloat(value);
   }
-  uint64_t hash(double value) const override {
-    return hasher_->hash(value);
+  uint64_t hashDouble(double value) const override {
+    return hasher_->hashDouble(value);
   }
-  uint64_t hash(const ByteArray* value) const override {
-    return hasher_->hash(value);
+  uint64_t hashByteArray(const ByteArray* value) const override {
+    return hasher_->hashByteArray(value);
   }
 
-  void hashes(const int32_t* values, int numValues, uint64_t* hashes)
+  void hashesInt32(const int32_t* values, int numValues, uint64_t* hashes)
       const override {
-    hasher_->hashes(values, numValues, hashes);
+    hasher_->hashesInt32(values, numValues, hashes);
   }
-  void hashes(const int64_t* values, int numValues, uint64_t* hashes)
+  void hashesInt64(const int64_t* values, int numValues, uint64_t* hashes)
       const override {
-    hasher_->hashes(values, numValues, hashes);
+    hasher_->hashesInt64(values, numValues, hashes);
   }
-  void hashes(const float* values, int numValues, uint64_t* hashes)
+  void hashesFloat(const float* values, int numValues, uint64_t* hashes)
       const override {
-    hasher_->hashes(values, numValues, hashes);
+    hasher_->hashesFloat(values, numValues, hashes);
   }
-  void hashes(const double* values, int numValues, uint64_t* hashes)
+  void hashesDouble(const double* values, int numValues, uint64_t* hashes)
       const override {
-    hasher_->hashes(values, numValues, hashes);
+    hasher_->hashesDouble(values, numValues, hashes);
   }
-  void hashes(const ByteArray* values, int numValues, uint64_t* hashes)
+  void hashesByteArray(const ByteArray* values, int numValues, uint64_t* hashes)
       const override {
-    hasher_->hashes(values, numValues, hashes);
-  }
-
-  uint64_t hash(const int32_t* value) const {
-    return hasher_->hash(*value);
-  }
-  uint64_t hash(const int64_t* value) const {
-    return hasher_->hash(*value);
-  }
-  uint64_t hash(const float* value) const {
-    return hasher_->hash(*value);
-  }
-  uint64_t hash(const double* value) const {
-    return hasher_->hash(*value);
+    hasher_->hashesByteArray(values, numValues, hashes);
   }
 
   /// Deserialize the Bloom filter from an input stream. It is used when

diff --git a/velox/dwio/parquet/common/CMakeLists.txt b/velox/dwio/parquet/common/CMakeLists.txt
@@ -17,7 +17,8 @@ velox_add_library(
   BloomFilter.cpp
   XxHasher.cpp
   LevelComparison.cpp
-  LevelConversion.cpp)
+  LevelConversion.cpp
+  ParquetBloomFilter.h)
 
 velox_link_libraries(
   velox_dwio_parquet_common

diff --git a/velox/dwio/parquet/common/Hasher.h b/velox/dwio/parquet/common/Hasher.h
@@ -47,31 +47,31 @@ class Hasher {
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(int32_t value) const = 0;
+  virtual uint64_t hashInt32(int32_t value) const = 0;
 
   /// Compute hash for 64 bits value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(int64_t value) const = 0;
+  virtual uint64_t hashInt64(int64_t value) const = 0;
 
   /// Compute hash for float value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(float value) const = 0;
+  virtual uint64_t hashFloat(float value) const = 0;
 
   /// Compute hash for double value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(double value) const = 0;
+  virtual uint64_t hashDouble(double value) const = 0;
 
   /// Compute hash for ByteArray value by using its plain encoding result.
   ///
   /// @param value the value to hash.
   /// @return hash result.
-  virtual uint64_t hash(const ByteArray* value) const = 0;
+  virtual uint64_t hashByteArray(const ByteArray* value) const = 0;
 
   /// Batch compute hashes for 32 bits values by using its plain encoding
   /// result.
@@ -80,8 +80,10 @@ class Hasher {
   /// @param num_values the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const int32_t* values, int num_values, uint64_t* hashes)
-      const = 0;
+  virtual void hashesInt32(
+      const int32_t* values,
+      int num_values,
+      uint64_t* hashes) const = 0;
 
   /// Batch compute hashes for 64 bits values by using its plain encoding
   /// result.
@@ -90,26 +92,30 @@ class Hasher {
   /// @param num_values the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const int64_t* values, int num_values, uint64_t* hashes)
-      const = 0;
+  virtual void hashesInt64(
+      const int64_t* values,
+      int num_values,
+      uint64_t* hashes) const = 0;
 
   /// Batch compute hashes for float values by using its plain encoding result.
   ///
   /// @param values a pointer to the values to hash.
   /// @param num_values the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const float* values, int num_values, uint64_t* hashes)
-      const = 0;
+  virtual void
+  hashesFloat(const float* values, int num_values, uint64_t* hashes) const = 0;
 
   /// Batch compute hashes for double values by using its plain encoding result.
   ///
   /// @param values a pointer to the values to hash.
   /// @param num_values the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const double* values, int num_values, uint64_t* hashes)
-      const = 0;
+  virtual void hashesDouble(
+      const double* values,
+      int num_values,
+      uint64_t* hashes) const = 0;
 
   /// Batch compute hashes for ByteArray values by using its plain encoding
   /// result.
@@ -118,8 +124,10 @@ class Hasher {
   /// @param num_values the number of values to hash.
   /// @param hashes a pointer to the output hash values, its length should be
   /// equal to num_values.
-  virtual void hashes(const ByteArray* values, int num_values, uint64_t* hashes)
-      const = 0;
+  virtual void hashesByteArray(
+      const ByteArray* values,
+      int num_values,
+      uint64_t* hashes) const = 0;
 
   virtual ~Hasher() = default;
 };