@@ -20,9 +20,10 @@ import org.apache.hadoop.conf.Configuration
20
20
import org .apache .hadoop .fs .Path
21
21
import org .apache .spark .sql .{AnalysisException , DataFrame , QueryTest , Row }
22
22
import org .apache .spark .sql .catalyst .plans .logical .{LogicalPlan , Project }
23
- import org .apache .spark .sql .execution .SortExec
23
+ import org .apache .spark .sql .execution .{ SortExec , SparkPlan => SparkPlanNode }
24
24
import org .apache .spark .sql .execution .datasources .{HadoopFsRelation , InMemoryFileIndex , LogicalRelation }
25
25
import org .apache .spark .sql .execution .exchange .ShuffleExchangeExec
26
+ import org .apache .spark .sql .execution .joins .SortMergeJoinExec
26
27
27
28
import com .microsoft .hyperspace .{Hyperspace , Implicits , SampleData , TestConfig , TestUtils }
28
29
import com .microsoft .hyperspace .index .IndexConstants .{GLOBBING_PATTERN_KEY , REFRESH_MODE_INCREMENTAL , REFRESH_MODE_QUICK }
@@ -670,13 +671,95 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
670
671
671
672
// Refreshed index as quick mode can be applied with Hybrid Scan config.
672
673
withSQLConf(TestConfig .HybridScanEnabled : _* ) {
673
- spark.disableHyperspace()
674
- val dfWithHyperspaceDisabled = query()
675
- val basePlan = dfWithHyperspaceDisabled.queryExecution.optimizedPlan
676
- spark.enableHyperspace()
677
- val dfWithHyperspaceEnabled = query()
678
- assert(! basePlan.equals(dfWithHyperspaceEnabled.queryExecution.optimizedPlan))
679
- checkAnswer(dfWithHyperspaceDisabled, dfWithHyperspaceEnabled)
674
+ spark.disableHyperspace()
675
+ val dfWithHyperspaceDisabled = query()
676
+ val basePlan = dfWithHyperspaceDisabled.queryExecution.optimizedPlan
677
+ spark.enableHyperspace()
678
+ val dfWithHyperspaceEnabled = query()
679
+ assert(! basePlan.equals(dfWithHyperspaceEnabled.queryExecution.optimizedPlan))
680
+ checkAnswer(dfWithHyperspaceDisabled, dfWithHyperspaceEnabled)
681
+ }
682
+ }
683
+ }
684
+ }
685
+
686
+ test(" Verify Hybrid Scan is not applied when shuffle is not removed." ) {
687
+ withTempPathAsString { testPath =>
688
+ val indexConfig = IndexConfig (" indexRight" , Seq (" c2" ), Seq (" c4" ))
689
+ val indexConfig2 = IndexConfig (" indexLeft" , Seq (" c2" ), Seq (" c3" ))
690
+ import spark .implicits ._
691
+ SampleData .testData
692
+ .toDF(" c1" , " c2" , " c3" , " c4" , " c5" )
693
+ .limit(10 )
694
+ .write
695
+ .parquet(testPath)
696
+ val df = spark.read.load(testPath)
697
+
698
+ withSQLConf(IndexConstants .INDEX_NUM_BUCKETS -> " 11" ) {
699
+ // Create an index with bucket num 11.
700
+ hyperspace.createIndex(df, indexConfig)
701
+ }
702
+ withSQLConf(IndexConstants .INDEX_NUM_BUCKETS -> " 12" ) {
703
+ // Create an index with bucket num 12.
704
+ hyperspace.createIndex(df, indexConfig2)
705
+ }
706
+
707
+ // Append to original data.
708
+ SampleData .testData
709
+ .toDF(" c1" , " c2" , " c3" , " c4" , " c5" )
710
+ .limit(3 )
711
+ .write
712
+ .mode(" append" )
713
+ .parquet(testPath)
714
+
715
+ {
716
+ // Create a join query.
717
+ val df2 = spark.read.parquet(testPath)
718
+
719
// Builds the join under test: the re-read data joined with itself on the shared key
// column, projecting a different second column on each side.
def query(): DataFrame = {
  val leftSide = df2.select(" c2", " c3")
  val rightSide = df2.select(" c2", " c4")
  leftSide.join(rightSide, " c2")
}
722
+
723
+ val inputFiles = df.inputFiles
724
+ val appendedFiles = df2.inputFiles.diff(inputFiles).map(new Path (_))
725
+
726
+ spark.enableHyperspace()
727
+ withSQLConf(TestConfig .HybridScanEnabled : _* ) {
728
/**
 * Counts the [[ShuffleExchangeExec]] operators collected from the given physical plan.
 *
 * @param sparkPlan physical plan (or sub-plan, e.g. one side of a join) to inspect.
 * @return number of shuffle exchange nodes matched by `collect`.
 */
def getShuffleCnt(sparkPlan: SparkPlanNode): Long = {
  val shuffleNodes = sparkPlan.collect { case exchange: ShuffleExchangeExec => exchange }
  shuffleNodes.size
}
731
+ withSQLConf(IndexConstants .INDEX_HYBRID_SCAN_SHUFFLE_CHECK_ENABLED -> " false" ) {
732
+ val execPlan = query().queryExecution.executedPlan
733
+ val shuffleCnt = execPlan.collect {
734
+ case smj : SortMergeJoinExec =>
735
+ (getShuffleCnt(smj.left), getShuffleCnt(smj.right))
736
+ }.head
737
+ assert(shuffleCnt._1 === 1 )
738
+ // Right child of join has 2 shuffle nodes because of Hybrid Scan for appended files.
739
+ assert(shuffleCnt._2 === 2 )
740
+
741
+ // Verify indexes are used, and all index files are picked.
742
+ verifyIndexUsage(
743
+ query,
744
+ getIndexFilesPath(indexConfig.indexName, Seq (0 )) ++ appendedFiles ++
745
+ getIndexFilesPath(indexConfig2.indexName, Seq (0 )) ++ appendedFiles)
746
+ }
747
+ withSQLConf(IndexConstants .INDEX_HYBRID_SCAN_SHUFFLE_CHECK_ENABLED -> " true" ) {
748
+ val execPlan = query().queryExecution.executedPlan
749
+ val shuffleCnt = execPlan.collect {
750
+ case smj : SortMergeJoinExec =>
751
+ (getShuffleCnt(smj.left), getShuffleCnt(smj.right))
752
+ }.head
753
+ assert(shuffleCnt._1 === 1 )
754
+ // One shuffle node of right child is removed with shuffle count check.
755
+ assert(shuffleCnt._2 === 1 )
756
+
757
+ // For the right child, indexRight can still be applied by FilterIndexRule.
758
+ verifyIndexUsage(
759
+ query,
760
+ getIndexFilesPath(indexConfig.indexName, Seq (0 )) ++ appendedFiles ++
761
+ getIndexFilesPath(indexConfig2.indexName, Seq (0 )) ++ appendedFiles)
762
+ }
680
763
}
681
764
}
682
765
}
0 commit comments