szehon-ho commented on code in PR #52443:
URL: https://github.com/apache/spark/pull/52443#discussion_r2380640672
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala:
##########
@@ -966,6 +969,51 @@ object KeyGroupedShuffleSpec {
}
}
+case class ShufflePartitionIdPassThroughSpec(
+ partitioning: ShufflePartitionIdPassThrough,
+ distribution: ClusteredDistribution) extends ShuffleSpec {
+
+ /**
+ * A sequence where each element is a set of positions of the partition key
to the cluster
+ * keys. Similar to HashShuffleSpec, this maps the partitioning expression
to positions
+ * in the distribution clustering keys.
+ */
+ lazy val keyPositions: mutable.BitSet = {
+ val distKeyToPos = mutable.Map.empty[Expression, mutable.BitSet]
+ distribution.clustering.zipWithIndex.foreach { case (distKey, distKeyPos)
=>
+ distKeyToPos.getOrElseUpdate(distKey.canonicalized,
mutable.BitSet.empty).add(distKeyPos)
+ }
+ distKeyToPos.getOrElse(partitioning.expr.child.canonicalized,
mutable.BitSet.empty)
+ }
+
+ override def isCompatibleWith(other: ShuffleSpec): Boolean = other match {
+ case SinglePartitionShuffleSpec =>
+ partitioning.numPartitions == 1
+ case otherPassThroughSpec @ ShufflePartitionIdPassThroughSpec(
+ otherPartitioning, otherDistribution) =>
+ // As ShufflePartitionIdPassThrough only allows a single expression
+ // as the partitioning expression, we check compatibility as follows:
+ // 1. Same number of clustering expressions
+ // 2. Same number of partitions
+ // 3. each pair of partitioning expression from both sides has
overlapping positions in their
Review Comment:
Sorry, I'm a bit new to this and curious: do we actually need to check whether
the partition expression of repartitionByExpression() is compatible (as is done
for
[KeyGroupedPartitioning](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala#L855))
before we can skip the shuffle, or am I missing something?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]