17
17
package org .apache .gluten .execution
18
18
19
19
import org .apache .spark .rdd .RDD
20
+ import org .apache .spark .rpc .GlutenDriverEndpoint
20
21
import org .apache .spark .sql .catalyst .expressions ._
21
- import org .apache .spark .sql .catalyst .optimizer .BuildSide
22
+ import org .apache .spark .sql .catalyst .optimizer .{ BuildRight , BuildSide }
22
23
import org .apache .spark .sql .catalyst .plans ._
23
- import org .apache .spark .sql .execution .SparkPlan
24
+ import org .apache .spark .sql .execution .{ SparkPlan , SQLExecution }
24
25
import org .apache .spark .sql .execution .joins .BuildSideRelation
25
26
import org .apache .spark .sql .vectorized .ColumnarBatch
26
27
27
28
import io .substrait .proto .JoinRel
28
29
30
/** Helpers for translating Spark join types into Substrait join types. */
object JoinTypeTransform {

  /**
   * Maps a Spark [[JoinType]] onto the Substrait `JoinRel.JoinType` to emit.
   *
   * For the one-sided joins (left/right outer, left semi, left anti) the result is mirrored
   * when `buildRight` is false.
   *
   * ExistenceJoin was introduced in SPARK-14781. It returns all rows from the left table with
   * a new column to indicate whether the row is matched in the right table. In CH it is
   * transformed into a left any join; Substrait has no left any join, so left semi join is
   * used instead, and isExistenceJoin is set to true to indicate that it is an existence join.
   *
   * @param sparkJoin
   *   the Spark join type to translate
   * @param buildRight
   *   whether the right side is the build side
   * @return
   *   the matching Substrait join type, or `UNRECOGNIZED` for join types not handled here
   * @throws IllegalArgumentException
   *   if `sparkJoin` is an ExistenceJoin and `buildRight` is false
   */
  def toSubstraitJoinType(sparkJoin: JoinType, buildRight: Boolean): JoinRel.JoinType = {
    // Picks between a join type and its mirrored counterpart based on the build side.
    def bySide(
        whenBuildRight: JoinRel.JoinType,
        whenBuildLeft: JoinRel.JoinType): JoinRel.JoinType =
      if (buildRight) whenBuildRight else whenBuildLeft

    sparkJoin match {
      case _: InnerLike =>
        JoinRel.JoinType.JOIN_TYPE_INNER
      case FullOuter =>
        JoinRel.JoinType.JOIN_TYPE_OUTER
      case LeftOuter =>
        bySide(JoinRel.JoinType.JOIN_TYPE_LEFT, JoinRel.JoinType.JOIN_TYPE_RIGHT)
      case RightOuter =>
        bySide(JoinRel.JoinType.JOIN_TYPE_RIGHT, JoinRel.JoinType.JOIN_TYPE_LEFT)
      case LeftSemi =>
        bySide(JoinRel.JoinType.JOIN_TYPE_LEFT_SEMI, JoinRel.JoinType.JOIN_TYPE_RIGHT_SEMI)
      case LeftAnti =>
        bySide(JoinRel.JoinType.JOIN_TYPE_LEFT_ANTI, JoinRel.JoinType.JOIN_TYPE_RIGHT_ANTI)
      case ExistenceJoin(_) =>
        // An existence join is only valid with the right side as the build side.
        if (buildRight) {
          JoinRel.JoinType.JOIN_TYPE_LEFT_SEMI
        } else {
          throw new IllegalArgumentException(" Existence join should not switch children")
        }
      case _ =>
        // TODO: Support cross join with Cross Rel
        JoinRel.JoinType.UNRECOGNIZED
    }
  }

}
78
+
29
79
case class ShuffledHashJoinExecTransformer (
30
80
leftKeys : Seq [Expression ],
31
81
rightKeys : Seq [Expression ],
@@ -99,6 +149,9 @@ case class BroadcastHashJoinExecTransformer(
99
149
right,
100
150
isNullAwareAntiJoin) {
101
151
152
+ // Unique ID of the built broadcast table, derived from the build plan's id.
153
+ lazy val buildBroadcastTableId : String = buildPlan.id.toString
154
+
102
155
override protected lazy val substraitJoinType : JoinRel .JoinType = joinType match {
103
156
case _ : InnerLike =>
104
157
JoinRel .JoinType .JOIN_TYPE_INNER
@@ -125,9 +178,38 @@ case class BroadcastHashJoinExecTransformer(
125
178
126
179
/**
 * Returns the columnar input RDDs of this join: the streamed side's RDDs followed by one RDD
 * wrapping the broadcast build side.
 *
 * Before the broadcast is executed, the broadcast table id is reported to
 * `GlutenDriverEndpoint.collectResources` together with the current SQL execution id. If no
 * execution id is set on the Spark context, only a warning is logged.
 */
override def columnarInputRDDs: Seq[RDD[ColumnarBatch]] = {
  val streamedRDD = getColumnarInputRDDs(streamedPlan)

  // The execution id local property may be null, so lift it into an Option
  // instead of comparing against null directly.
  Option(sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)) match {
    case Some(executionId) =>
      // Presumably lets the driver release the broadcast table once the execution ends;
      // confirm against GlutenDriverEndpoint.
      GlutenDriverEndpoint.collectResources(executionId, buildBroadcastTableId)
    case None =>
      logWarning(
        s" Can't trace broadcast table data $buildBroadcastTableId" +
          s" because execution id is null. " +
          s" Will clean up until expire time. ")
  }

  val broadcast = buildPlan.executeBroadcast[BuildSideRelation]()
  // NOTE(review): isNullAwareAntiJoin is left at the context's default (false) here;
  // confirm that is intended.
  val context =
    BroadCastHashJoinContext(
      buildKeyExprs,
      joinType,
      buildSide == BuildRight,
      condition.isDefined,
      joinType.isInstanceOf[ExistenceJoin],
      buildPlan.output,
      buildBroadcastTableId)
  val broadcastRDD = VeloxBroadcastBuildSideRDD(sparkContext, broadcast, context)
  // FIXME: Do we have to make build side a RDD?
  streamedRDD :+ broadcastRDD
}
133
205
}
206
+
207
/**
 * Describes a broadcast hash join's build side; it is handed to the broadcast build-side RDD
 * together with the broadcast relation.
 *
 * @param buildSideJoinKeys
 *   join key expressions of the build side
 * @param joinType
 *   the Spark join type
 * @param buildRight
 *   true when the build side is the right child
 * @param hasMixedFiltCondition
 *   true when the join has a condition defined (the caller passes `condition.isDefined`)
 * @param isExistenceJoin
 *   true when `joinType` is an ExistenceJoin
 * @param buildSideStructure
 *   output attributes of the build-side plan
 * @param buildHashTableId
 *   id identifying the built broadcast table
 * @param isNullAwareAntiJoin
 *   null-aware anti join flag; defaults to false
 */
case class BroadCastHashJoinContext(
    buildSideJoinKeys: Seq[Expression],
    joinType: JoinType,
    buildRight: Boolean,
    hasMixedFiltCondition: Boolean,
    isExistenceJoin: Boolean,
    buildSideStructure: Seq[Attribute],
    buildHashTableId: String,
    isNullAwareAntiJoin: Boolean = false)
0 commit comments