This repository was archived by the owner on Jun 14, 2024. It is now read-only.

Commit 30cc3ef

Support filter with indexes on nested fields
1 parent cbfd4f6 commit 30cc3ef

4 files changed: +298 -14 lines changed

src/main/scala/com/microsoft/hyperspace/index/rules/FilterIndexRule.scala

+15 -7

@@ -18,17 +18,18 @@ package com.microsoft.hyperspace.index.rules
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.analysis.CleanupAliases
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
+import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LeafNode, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
 
 import com.microsoft.hyperspace.{ActiveSparkSession, Hyperspace}
 import com.microsoft.hyperspace.actions.Constants
 import com.microsoft.hyperspace.index.IndexLogEntry
 import com.microsoft.hyperspace.index.rankers.FilterIndexRanker
+import com.microsoft.hyperspace.index.rules.PlanUtils._
 import com.microsoft.hyperspace.index.sources.FileBasedRelation
 import com.microsoft.hyperspace.telemetry.{AppInfo, HyperspaceEventLogging, HyperspaceIndexUsageEvent}
-import com.microsoft.hyperspace.util.{HyperspaceConf, ResolverUtils}
+import com.microsoft.hyperspace.util.{HyperspaceConf, ResolverUtils, SchemaUtils}
 
 /**
  * FilterIndex rule looks for opportunities in a logical plan to replace
@@ -113,8 +114,8 @@ object FilterIndexRule
 
     val candidateIndexes = allIndexes.filter { index =>
       indexCoversPlan(
-        outputColumns,
-        filterColumns,
+        SchemaUtils.prefixNestedFieldNames(outputColumns),
+        SchemaUtils.prefixNestedFieldNames(filterColumns),
         index.indexedColumns,
         index.includedColumns)
     }
@@ -136,7 +137,6 @@ object FilterIndexRule
   * @param filterColumns List of columns in filter predicate.
   * @param indexedColumns List of indexed columns (e.g. from an index being checked)
   * @param includedColumns List of included columns (e.g. from an index being checked)
-  * @param fileFormat FileFormat for input relation in original logical plan.
   * @return 'true' if
   *         1. Index fully covers output and filter columns, and
   *         2. Filter predicate contains first column in index's 'indexed' columns.
@@ -168,9 +168,17 @@ object ExtractFilterNode {
     val projectColumnNames = CleanupAliases(project)
       .asInstanceOf[Project]
       .projectList
-      .map(_.references.map(_.asInstanceOf[AttributeReference].name))
+      .map(extractNamesFromExpression)
      .flatMap(_.toSeq)
-    val filterColumnNames = condition.references.map(_.name).toSeq
+    val filterColumnNames = extractNamesFromExpression(condition).toSeq
+      .sortBy(-_.length)
+      .foldLeft(Seq.empty[String]) { (acc, e) =>
+        if (!acc.exists(i => i.startsWith(e))) {
+          acc :+ e
+        } else {
+          acc
+        }
+      }
 
     Some(project, filter, projectColumnNames, filterColumnNames)
src/main/scala/com/microsoft/hyperspace/index/rules/PlanUtils.scala

+165 -0 (new file)
@@ -0,0 +1,165 @@
+/*
+ * Copyright (2020) The Hyperspace Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.microsoft.hyperspace.index.rules
+
+import scala.util.Try
+
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, GetStructField}
+import org.apache.spark.sql.types.{DataType, StructType}
+
+import com.microsoft.hyperspace.util.SchemaUtils
+
+object PlanUtils {
+
+  /**
+   * Extracts field names from a Spark Catalyst [[Expression]].
+   *
+   * @param exp The Spark Catalyst expression from which to extract names.
+   * @return A set of distinct field names.
+   */
+  def extractNamesFromExpression(exp: Expression): Set[String] = {
+    exp match {
+      case AttributeReference(name, _, _, _) =>
+        Set(s"$name")
+      case otherExp =>
+        otherExp.containsChild.map {
+          case g: GetStructField =>
+            s"${getChildNameFromStruct(g)}"
+          case e: Expression =>
+            extractNamesFromExpression(e).filter(_.nonEmpty).mkString(".")
+          case _ => ""
+        }
+    }
+  }
+
+  /**
+   * Given a [[GetStructField]] expression for a nested field (i.e. a struct
+   * field), extracts the full `.` (dot) separated field name.
+   *
+   * @param field The [[GetStructField]] field from which we want to extract
+   *              the name.
+   * @return A field name, `.` (dot) separated if nested.
+   */
+  def getChildNameFromStruct(field: GetStructField): String = {
+    field.child match {
+      case f: GetStructField =>
+        s"${getChildNameFromStruct(f)}.${field.name.get}"
+      case a: AttributeReference =>
+        s"${a.name}.${field.name.get}"
+      case _ =>
+        s"${field.name.get}"
+    }
+  }
+
+  /**
+   * Given a Spark Catalyst [[Expression]] and a field name, extracts the parent
+   * search expression and the expression that contains the field name.
+   *
+   * @param exp The Spark Catalyst [[Expression]] to extract from.
+   * @param name The field name to search for.
+   * @return A tuple of the parent expression and the leaf expression that
+   *         contains the given name.
+   */
+  def extractSearchQuery(exp: Expression, name: String): (Expression, Expression) = {
+    // String.split takes a regex, so the dot separator must be escaped.
+    val splits = name.split("\\.")
+    val expFound = exp.find {
+      case a: AttributeReference if splits.forall(s => a.name.contains(s)) => true
+      case f: GetStructField if splits.forall(s => f.toString().contains(s)) => true
+      case _ => false
+    }.get
+    val parent = exp.find {
+      case e: Expression if e.containsChild.contains(expFound) => true
+      case _ => false
+    }.get
+    (parent, expFound)
+  }
+
+  /**
+   * Given a parent Spark Catalyst [[Expression]], a needle [[Expression]] and a
+   * replacement [[Expression]], replaces the needle with the replacement inside
+   * the parent expression.
+   *
+   * @param parent The parent Spark Catalyst [[Expression]] in which to replace.
+   * @param needle The Spark Catalyst [[Expression]] needle to search for.
+   * @param repl The replacement Spark Catalyst [[Expression]].
+   * @return A new Spark Catalyst [[Expression]].
+   */
+  def replaceInSearchQuery(
+      parent: Expression,
+      needle: Expression,
+      repl: Expression): Expression = {
+    parent.mapChildren { c =>
+      if (c == needle) {
+        repl
+      } else {
+        c
+      }
+    }
+  }
+
+  /**
+   * Given a Spark Catalyst [[Expression]] and a field name, extracts the
+   * [[AttributeReference]] for that field name.
+   *
+   * @param exp The Spark Catalyst [[Expression]] to extract from.
+   * @param name The field name for which to extract the attribute reference.
+   * @return A Spark Catalyst [[AttributeReference]] pointing to the field name.
+   */
+  def extractAttributeRef(exp: Expression, name: String): AttributeReference = {
+    val splits = name.split("\\.")
+    val elem = exp.find {
+      case a: AttributeReference if splits.contains(a.name) => true
+      case _ => false
+    }
+    elem.get.asInstanceOf[AttributeReference]
+  }
+
+  /**
+   * Given a Spark Catalyst [[Expression]] and a field name, extracts the type
+   * of the field as a Spark SQL [[DataType]].
+   *
+   * @param exp The Spark Catalyst [[Expression]] from which to extract the type.
+   * @param name The field name for which we need to get the type.
+   * @return The Spark SQL [[DataType]] of the given field name.
+   */
+  def extractTypeFromExpression(exp: Expression, name: String): DataType = {
+    val splits = name.split("\\.")
+    val elem = exp.flatMap {
+      case a: AttributeReference =>
+        if (splits.forall(s => a.name == s)) {
+          Some((name, a.dataType))
+        } else {
+          Try({
+            val h :: t = splits.toList
+            if (a.name == h && a.dataType.isInstanceOf[StructType]) {
+              // Resolve the remaining path components against the struct's
+              // fields and record each component's type; the last one wins.
+              val currentDataType = a.dataType.asInstanceOf[StructType]
+              val foldedFields = t.foldLeft(Seq.empty[(String, DataType)]) { (acc, i) =>
+                val idx = currentDataType.indexWhere(_.name.equalsIgnoreCase(i))
+                acc :+ (i, currentDataType(idx).dataType)
+              }
+              Some(foldedFields.last)
+            } else {
+              None
+            }
+          }).getOrElse(None)
+        }
+      case f: GetStructField if splits.forall(s => f.toString().contains(s)) =>
+        Some((name, f.dataType))
+      case _ => None
+    }
+    elem.find(e => e._1 == name || e._1 == splits.last).get._2
+  }
+}
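To see how these helpers fit together, here is a small sketch that hand-builds the Catalyst expression for a nested field and runs the extractors over it. The schema and names are invented for the example; it assumes only Spark's Catalyst API and this new PlanUtils on the classpath:

object PlanUtilsSketch extends App {
  import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GetStructField}
  import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
  import com.microsoft.hyperspace.index.rules.PlanUtils._

  // Hypothetical schema: nested STRUCT<leaf: STRUCT<cnt: INT>>
  val leafType = StructType(Seq(StructField("cnt", IntegerType)))
  val nested = AttributeReference("nested", StructType(Seq(StructField("leaf", leafType))))()

  // The expression tree Catalyst builds for nested.leaf.cnt:
  // GetStructField nodes chained on the root attribute.
  val leaf = GetStructField(nested, 0, Some("leaf"))
  val cnt = GetStructField(leaf, 0, Some("cnt"))

  println(getChildNameFromStruct(cnt))                       // nested.leaf.cnt
  println(extractAttributeRef(cnt, "nested.leaf.cnt"))       // the root attribute, nested#<id>
  println(extractTypeFromExpression(cnt, "nested.leaf.cnt")) // IntegerType
}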

src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala

+79 -5

@@ -21,7 +21,7 @@ import scala.collection.mutable
 import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.catalog.BucketSpec
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, In, Literal, Not}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, ExprId, GetStructField, In, Literal, Not}
 import org.apache.spark.sql.catalyst.optimizer.OptimizeIn
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.datasources._
@@ -32,8 +32,9 @@ import com.microsoft.hyperspace.Hyperspace
 import com.microsoft.hyperspace.index._
 import com.microsoft.hyperspace.index.IndexLogEntryTags.{HYBRIDSCAN_RELATED_CONFIGS, IS_HYBRIDSCAN_CANDIDATE}
 import com.microsoft.hyperspace.index.plans.logical.{BucketUnion, IndexHadoopFsRelation}
+import com.microsoft.hyperspace.index.rules.PlanUtils._
 import com.microsoft.hyperspace.index.sources.FileBasedRelation
-import com.microsoft.hyperspace.util.HyperspaceConf
+import com.microsoft.hyperspace.util.{HyperspaceConf, ResolverUtils, SchemaUtils}
 
 object RuleUtils {
 
@@ -286,10 +287,31 @@ object RuleUtils
         new ParquetFileFormat,
         Map(IndexConstants.INDEX_RELATION_IDENTIFIER))(spark, index)
 
-      val updatedOutput = relation.plan.output
-        .filter(attr => indexFsRelation.schema.fieldNames.contains(attr.name))
-        .map(_.asInstanceOf[AttributeReference])
+      val flatSchema =
+        ResolverUtils.resolve(spark, index.indexedColumns ++ index.includedColumns, relation.plan)
+      // SchemaUtils.escapeFieldNames(SchemaUtils.flatten(relation.plan.schema))
+      val updatedOutput =
+        if (flatSchema.isDefined && SchemaUtils.containsNestedFieldNames(flatSchema.get)) {
+          indexFsRelation.schema.flatMap { s =>
+            val exprId = getFieldPosition(index, s.name)
+            relation.plan.output.find(a => s.name.contains(a.name)).map { a =>
+              AttributeReference(s.name, s.dataType, a.nullable, a.metadata)(
+                ExprId(exprId),
+                a.qualifier)
+            }
+          }
+        } else {
+          relation.plan.output
+            .filter(attr => indexFsRelation.schema.fieldNames.contains(attr.name))
+            .map(_.asInstanceOf[AttributeReference])
+        }
       relation.createLogicalRelation(indexFsRelation, updatedOutput)
+
+    case p: Project if provider.isSupportedProject(p) =>
+      transformProject(p, index)
+
+    case f: Filter if provider.isSupportedFilter(f) =>
+      transformFilter(f, index)
     }
   }
 
@@ -576,4 +598,56 @@ object RuleUtils
     assert(shuffleInjected)
     shuffled
   }
+
+  private def transformProject(project: Project, index: IndexLogEntry): Project = {
+    val projectedFields = project.projectList.map { exp =>
+      val fieldName = extractNamesFromExpression(exp).head
+      val escapedFieldName = SchemaUtils.prefixNestedFieldName(fieldName)
+      val attr = extractAttributeRef(exp, fieldName)
+      val fieldType = extractTypeFromExpression(exp, fieldName)
+      val exprId = getFieldPosition(index, escapedFieldName)
+      attr.copy(escapedFieldName, fieldType, attr.nullable, attr.metadata)(
+        ExprId(exprId),
+        attr.qualifier)
+    }
+    project.copy(projectList = projectedFields)
+  }
+
+  private def transformFilter(filter: Filter, index: IndexLogEntry): Filter = {
+    val fieldNames = extractNamesFromExpression(filter.condition)
+    var mutableFilter = filter
+    fieldNames.foreach { fieldName =>
+      val escapedFieldName = SchemaUtils.prefixNestedFieldName(fieldName)
+      val nestedFields = getNestedFields(index)
+      if (nestedFields.nonEmpty &&
+          nestedFields.exists(i => i.equalsIgnoreCase(escapedFieldName))) {
+        val (parentExpression, exp) =
+          extractSearchQuery(filter.condition, fieldName)
+        val fieldType = extractTypeFromExpression(exp, fieldName)
+        val attr = extractAttributeRef(exp, fieldName)
+        val exprId = getFieldPosition(index, escapedFieldName)
+        val newAttr = attr.copy(escapedFieldName, fieldType, attr.nullable, attr.metadata)(
+          ExprId(exprId),
+          attr.qualifier)
+        val newExp = exp match {
+          case _: GetStructField => newAttr
+          case other: Expression => other
+        }
+        val newParentExpression =
+          replaceInSearchQuery(parentExpression, exp, newExp)
+        mutableFilter = filter.copy(condition = newParentExpression)
+      }
+    }
+    mutableFilter
+  }
+
+  private def getNestedFields(index: IndexLogEntry): Seq[String] = {
+    index.schema.fieldNames.filter(_.startsWith(SchemaUtils.NESTED_FIELD_PREFIX))
+  }
+
+  private def getFieldPosition(index: IndexLogEntry, fieldName: String): Int = {
+    index.schema.fieldNames.indexWhere(_.equalsIgnoreCase(fieldName))
+  }
 }
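The net effect of transformProject and transformFilter is to retarget a plan from the nested source column to the flattened index column. A hedged before/after sketch; the source schema, the index, the printed plan fragments, and the prefix value are illustrative assumptions (the actual prefix constant lives in SchemaUtils, outside this diff):

// Hypothetical source: id INT, nested STRUCT<leaf: STRUCT<cnt: INT>>,
// indexed with indexedColumns = Seq("nested.leaf.cnt").
val query = df.filter(df("nested.leaf.cnt") === 1).select(df("nested.leaf.cnt"))

// Before the rule fires, the predicate is a GetStructField chain on the data column:
//   Filter (nested#2.leaf.cnt = 1)
// After transformFilter, the chain is replaced by a flat AttributeReference whose
// name is the prefixed field name and whose ExprId is the field's position in the
// index schema, e.g.:
//   Filter (__hs_nested.nested.leaf.cnt#1 = 1)
// so the rewritten plan reads the flattened column directly from the index files.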

src/main/scala/com/microsoft/hyperspace/index/sources/FileBasedSourceProviderManager.scala

+39 -2

@@ -19,12 +19,14 @@ package com.microsoft.hyperspace.index.sources
 import scala.util.{Success, Try}
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.util.hyperspace.Utils
 
 import com.microsoft.hyperspace.HyperspaceException
 import com.microsoft.hyperspace.index.Relation
-import com.microsoft.hyperspace.util.{CacheWithTransform, HyperspaceConf}
+import com.microsoft.hyperspace.index.rules.PlanUtils._
+import com.microsoft.hyperspace.util.{CacheWithTransform, HyperspaceConf, SchemaUtils}
 
 /**
  * [[FileBasedSourceProviderManager]] is responsible for loading source providers which implements
@@ -107,6 +109,41 @@ class FileBasedSourceProviderManager(spark: SparkSession) {
     }
   }
 
+  /**
+   * Returns true if the given project is supported by the nested-field
+   * transformations, i.e. its project list or a child filter condition
+   * references nested fields.
+   *
+   * @param project Project to check if it's supported.
+   * @return True if the given project is a supported project.
+   */
+  def isSupportedProject(project: Project): Boolean = {
+    val containsNestedFields = SchemaUtils.containsNestedFieldNames(
+      project.projectList.flatMap(extractNamesFromExpression))
+    var containsNestedChildren = false
+    project.child.foreach {
+      case f: Filter =>
+        containsNestedChildren = containsNestedChildren || {
+          SchemaUtils.containsNestedFieldNames(SchemaUtils.removePrefixNestedFieldNames(
+            extractNamesFromExpression(f.condition).toSeq))
+        }
+      case _ =>
+    }
+    containsNestedFields || containsNestedChildren
+  }
+
+  /**
+   * Returns true if the given filter is supported by the nested-field
+   * transformations, i.e. its condition references nested fields.
+   *
+   * @param filter Filter to check if it's supported.
+   * @return True if the given filter is a supported filter.
+   */
+  def isSupportedFilter(filter: Filter): Boolean = {
+    SchemaUtils.containsNestedFieldNames(
+      extractNamesFromExpression(filter.condition).toSeq)
+  }
+
   /**
    * Runs the given function 'f', which executes a [[FileBasedSourceProvider]]'s API that returns
    * [[Option]] for each provider built. This function ensures that only one provider returns
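The SchemaUtils helpers this commit leans on (prefixNestedFieldName(s), removePrefixNestedFieldNames, containsNestedFieldNames, NESTED_FIELD_PREFIX) are defined outside this diff. The sketch below is an assumption about their behavior, written only to make the calls above readable; the real implementations in com.microsoft.hyperspace.util.SchemaUtils may differ, including the prefix value:

object SchemaUtilsSketch {
  // Assumed marker for flattened nested columns in an index schema.
  val NESTED_FIELD_PREFIX = "__hs_nested."

  // "nested.leaf.cnt" -> "__hs_nested.nested.leaf.cnt"; top-level names pass through.
  def prefixNestedFieldName(name: String): String =
    if (name.contains(".") && !name.startsWith(NESTED_FIELD_PREFIX)) NESTED_FIELD_PREFIX + name
    else name

  def prefixNestedFieldNames(names: Seq[String]): Seq[String] =
    names.map(prefixNestedFieldName)

  def removePrefixNestedFieldNames(names: Seq[String]): Seq[String] =
    names.map(_.stripPrefix(NESTED_FIELD_PREFIX))

  // A dot in a field name indicates a nested field.
  def containsNestedFieldNames(names: Seq[String]): Boolean =
    names.exists(_.contains("."))
}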
