korlov42 commented on code in PR #4992:
URL: https://github.com/apache/ignite-3/pull/4992#discussion_r1905362633


##########
modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/metadata/IgniteMdRowCount.java:
##########
@@ -100,68 +106,334 @@ public double getRowCount(IgniteLimit rel, 
RelMetadataQuery mq) {
     }
 
     /**
-     * JoinRowCount.
-     * TODO Documentation https://issues.apache.org/jira/browse/IGNITE-15859
+     * Estimates the number of rows produced by a join operation.
+     *
+     * <p>This method calculates an estimated row count for a join by 
analyzing the join type,
+     * join keys, and the cardinality of the left and right inputs. It 
provides specialized
+     * handling for primary key and foreign key relationships. When certain 
metadata is unavailable
+     * or when specific conditions are not met, it falls back to Calcite's 
default implementation
+     * for estimating the row count.
+     *
+     * <p>Implementation details:</p>
+     * <ul>
+     *   <li>If the join type is not {@link JoinRelType#INNER}, Calcite's 
default implementation is used.</li>
+     *   <li>If the join is non-equi join, Calcite's default implementation is 
used.</li>
+     *   <li>The row counts of the left and right inputs are retrieved using 
+     *   {@link RelMetadataQuery#getRowCount}. If either value is unavailable, 
the result is {@code null}.</li>
+     *   <li>If the row counts are very small (≤ 1.0), the method uses the 
maximum row count as a fallback.</li>
+     *   <li>Join key origins are resolved for the left and right inputs, and 
relationships between tables 
+     *   (e.g., primary key or foreign key associations) are identified and 
grouped into join contexts.</li>
+     *   <li>If no valid join context is found, the method falls back to 
Calcite's implementation.</li>
+     *   <li>The base row count is determined by the type of join relationship:
+     *       <ul>
+     *           <li>For primary key-to-primary key joins, the row count is 
based on the smaller table, 
+     *           adjusted by a percentage of the larger table's rows.</li>
+     *           <li>For foreign key joins, the base table is determined based 
on which table is 
+     *           joined using non-primary key columns.</li>
+     *       </ul>
+     *   </li>
+     *   <li>An additional adjustment factor is applied for post-filtration 
conditions, such as extra join keys 
+     *   or non-equi conditions.</li>
+     *   <li>If metadata for the percentage of original rows is unavailable, 
the adjustment defaults to 1.0.</li>
+     * </ul>
+     *
+     * <p>If none of the above criteria are satisfied, the method defaults to 
+     * {@link RelMdUtil#getJoinRowCount} for the estimation.</p>
+     *
+     * @param mq The {@link RelMetadataQuery} used to retrieve metadata about 
relational expressions.
+     * @param rel The {@link Join} relational expression representing the join 
operation.
+     * @return The estimated number of rows resulting from the join, or {@code 
null} if the estimation cannot be determined.
+     *
+     * @see RelMetadataQuery#getRowCount
+     * @see RelMdUtil#getJoinRowCount
      */
-    @Nullable
-    public static Double joinRowCount(RelMetadataQuery mq, Join rel) {
-        if (!rel.getJoinType().projectsRight()) {
-            // Create a RexNode representing the selectivity of the
-            // semijoin filter and pass it to getSelectivity
-            RexNode semiJoinSelectivity =
-                    RelMdUtil.makeSemiJoinSelectivityRexNode(mq, rel);
+    public static @Nullable Double joinRowCount(RelMetadataQuery mq, Join rel) 
{
+        if (rel.getJoinType() != JoinRelType.INNER) {
+            // Fall-back to calcite's implementation.
+            return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition());
+        }
 
-            return multiply(mq.getSelectivity(rel.getLeft(), 
semiJoinSelectivity),
-                    mq.getRowCount(rel.getLeft()));
+        JoinInfo joinInfo = rel.analyzeCondition();
+
+        if (joinInfo.pairs().isEmpty()) {
+            // Fall-back to calcite's implementation.
+            return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition());
         }
 
-        // Row count estimates of 0 will be rounded up to 1.
-        // So, use maxRowCount where the product is very small.
-        final Double left = mq.getRowCount(rel.getLeft());
-        final Double right = mq.getRowCount(rel.getRight());
+        Double leftRowCount = mq.getRowCount(rel.getLeft());
+        Double rightRowCount = mq.getRowCount(rel.getRight());
 
-        if (left == null || right == null) {
+        if (leftRowCount == null || rightRowCount == null) {
             return null;
         }
 
-        if (left <= 1D || right <= 1D) {
+        // Row count estimates of 0 will be rounded up to 1.
+        // So, use maxRowCount where the product is very small.
+        if (leftRowCount <= 1.0 || rightRowCount <= 1.0) {
             Double max = mq.getMaxRowCount(rel);
-            if (max != null && max <= 1D) {
+            if (max != null && max <= 1.0) {
                 return max;
             }
         }
 
-        JoinInfo joinInfo = rel.analyzeCondition();
+        Int2ObjectMap<KeyColumnOrigin> columnsFromLeft = resolveOrigins(mq, 
rel.getLeft(), joinInfo.leftKeys);
+        Int2ObjectMap<KeyColumnOrigin> columnsFromRight = resolveOrigins(mq, 
rel.getRight(), joinInfo.rightKeys);
+
+        Map<TablesPair, JoinContext> joinContexts = new HashMap<>();
+        for (IntPair joinKeys : joinInfo.pairs()) {
+            KeyColumnOrigin leftKey = columnsFromLeft.get(joinKeys.source);
+            KeyColumnOrigin rightKey = columnsFromRight.get(joinKeys.target);
+
+            if (leftKey == null || rightKey == null) {
+                continue;
+            }
+
+            joinContexts.computeIfAbsent(
+                    new TablesPair(
+                            leftKey.origin.getOriginTable(),
+                            rightKey.origin.getOriginTable()
+                    ),
+                    key -> {
+                        IgniteTable leftTable = 
key.left.unwrap(IgniteTable.class);
+                        IgniteTable rightTable = 
key.right.unwrap(IgniteTable.class);
+
+                        assert leftTable != null && rightTable != null;
+
+                        int leftPkSize = leftTable.keyColumns().size();
+                        int rightPkSize = rightTable.keyColumns().size();
+
+                        return new JoinContext(leftPkSize, rightPkSize);
+                    }
+            ).countKeys(leftKey, rightKey);
+        }
+
+        if (joinContexts.isEmpty()) {
+            // Fall-back to calcite's implementation.
+            return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition());
+        }
+
+        Iterator<JoinContext> it = joinContexts.values().iterator();
+        JoinContext context = it.next();
+        while (it.hasNext()) {
+            JoinContext nextContext = it.next();
+            if (nextContext.joinType().strength > context.joinType().strength) 
{
+                context = nextContext;
+            }
+
+            if (context.joinType().strength == 
JoiningRelationType.PK_ON_PK.strength) {
+                break;
+            }
+        }
+
+        if (context.joinType() == JoiningRelationType.UNKNOWN) {
+            // Fall-back to calcite's implementation.
+            return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition());
+        }
+
+        double postFiltrationAdjustment = joinContexts.size() == 1 && 
joinInfo.isEqui() ? 1.0
+                // Extra join keys as well as non-equi conditions serves azs 
post-filtration,

Review Comment:
   fixed, thanks



##########
modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/metadata/IgniteMdRowCount.java:
##########
@@ -100,68 +106,334 @@ public double getRowCount(IgniteLimit rel, 
RelMetadataQuery mq) {
     }
 
     /**
-     * JoinRowCount.
-     * TODO Documentation https://issues.apache.org/jira/browse/IGNITE-15859
+     * Estimates the number of rows produced by a join operation.
+     *
+     * <p>This method calculates an estimated row count for a join by 
analyzing the join type,
+     * join keys, and the cardinality of the left and right inputs. It 
provides specialized
+     * handling for primary key and foreign key relationships. When certain 
metadata is unavailable
+     * or when specific conditions are not met, it falls back to Calcite's 
default implementation
+     * for estimating the row count.
+     *
+     * <p>Implementation details:</p>
+     * <ul>
+     *   <li>If the join type is not {@link JoinRelType#INNER}, Calcite's 
default implementation is used.</li>
+     *   <li>If the join is non-equi join, Calcite's default implementation is 
used.</li>
+     *   <li>The row counts of the left and right inputs are retrieved using 
+     *   {@link RelMetadataQuery#getRowCount}. If either value is unavailable, 
the result is {@code null}.</li>
+     *   <li>If the row counts are very small (≤ 1.0), the method uses the 
maximum row count as a fallback.</li>
+     *   <li>Join key origins are resolved for the left and right inputs, and 
relationships between tables 
+     *   (e.g., primary key or foreign key associations) are identified and 
grouped into join contexts.</li>
+     *   <li>If no valid join context is found, the method falls back to 
Calcite's implementation.</li>
+     *   <li>The base row count is determined by the type of join relationship:
+     *       <ul>
+     *           <li>For primary key-to-primary key joins, the row count is 
based on the smaller table, 
+     *           adjusted by a percentage of the larger table's rows.</li>
+     *           <li>For foreign key joins, the base table is determined based 
on which table is 
+     *           joined using non-primary key columns.</li>
+     *       </ul>
+     *   </li>
+     *   <li>An additional adjustment factor is applied for post-filtration 
conditions, such as extra join keys 
+     *   or non-equi conditions.</li>
+     *   <li>If metadata for the percentage of original rows is unavailable, 
the adjustment defaults to 1.0.</li>
+     * </ul>
+     *
+     * <p>If none of the above criteria are satisfied, the method defaults to 
+     * {@link RelMdUtil#getJoinRowCount} for the estimation.</p>
+     *
+     * @param mq The {@link RelMetadataQuery} used to retrieve metadata about 
relational expressions.
+     * @param rel The {@link Join} relational expression representing the join 
operation.
+     * @return The estimated number of rows resulting from the join, or {@code 
null} if the estimation cannot be determined.
+     *
+     * @see RelMetadataQuery#getRowCount
+     * @see RelMdUtil#getJoinRowCount
      */
-    @Nullable
-    public static Double joinRowCount(RelMetadataQuery mq, Join rel) {
-        if (!rel.getJoinType().projectsRight()) {
-            // Create a RexNode representing the selectivity of the
-            // semijoin filter and pass it to getSelectivity
-            RexNode semiJoinSelectivity =
-                    RelMdUtil.makeSemiJoinSelectivityRexNode(mq, rel);
+    public static @Nullable Double joinRowCount(RelMetadataQuery mq, Join rel) 
{
+        if (rel.getJoinType() != JoinRelType.INNER) {
+            // Fall-back to calcite's implementation.
+            return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition());
+        }
 
-            return multiply(mq.getSelectivity(rel.getLeft(), 
semiJoinSelectivity),
-                    mq.getRowCount(rel.getLeft()));
+        JoinInfo joinInfo = rel.analyzeCondition();
+
+        if (joinInfo.pairs().isEmpty()) {
+            // Fall-back to calcite's implementation.
+            return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition());
         }
 
-        // Row count estimates of 0 will be rounded up to 1.
-        // So, use maxRowCount where the product is very small.
-        final Double left = mq.getRowCount(rel.getLeft());
-        final Double right = mq.getRowCount(rel.getRight());
+        Double leftRowCount = mq.getRowCount(rel.getLeft());
+        Double rightRowCount = mq.getRowCount(rel.getRight());
 
-        if (left == null || right == null) {
+        if (leftRowCount == null || rightRowCount == null) {
             return null;
         }
 
-        if (left <= 1D || right <= 1D) {
+        // Row count estimates of 0 will be rounded up to 1.
+        // So, use maxRowCount where the product is very small.
+        if (leftRowCount <= 1.0 || rightRowCount <= 1.0) {
             Double max = mq.getMaxRowCount(rel);
-            if (max != null && max <= 1D) {
+            if (max != null && max <= 1.0) {
                 return max;
             }
         }
 
-        JoinInfo joinInfo = rel.analyzeCondition();
+        Int2ObjectMap<KeyColumnOrigin> columnsFromLeft = resolveOrigins(mq, 
rel.getLeft(), joinInfo.leftKeys);
+        Int2ObjectMap<KeyColumnOrigin> columnsFromRight = resolveOrigins(mq, 
rel.getRight(), joinInfo.rightKeys);
+
+        Map<TablesPair, JoinContext> joinContexts = new HashMap<>();
+        for (IntPair joinKeys : joinInfo.pairs()) {
+            KeyColumnOrigin leftKey = columnsFromLeft.get(joinKeys.source);
+            KeyColumnOrigin rightKey = columnsFromRight.get(joinKeys.target);
+
+            if (leftKey == null || rightKey == null) {
+                continue;
+            }
+
+            joinContexts.computeIfAbsent(
+                    new TablesPair(
+                            leftKey.origin.getOriginTable(),
+                            rightKey.origin.getOriginTable()
+                    ),
+                    key -> {
+                        IgniteTable leftTable = 
key.left.unwrap(IgniteTable.class);
+                        IgniteTable rightTable = 
key.right.unwrap(IgniteTable.class);
+
+                        assert leftTable != null && rightTable != null;
+
+                        int leftPkSize = leftTable.keyColumns().size();
+                        int rightPkSize = rightTable.keyColumns().size();
+
+                        return new JoinContext(leftPkSize, rightPkSize);
+                    }
+            ).countKeys(leftKey, rightKey);
+        }
+
+        if (joinContexts.isEmpty()) {
+            // Fall-back to calcite's implementation.
+            return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition());
+        }
+
+        Iterator<JoinContext> it = joinContexts.values().iterator();
+        JoinContext context = it.next();
+        while (it.hasNext()) {
+            JoinContext nextContext = it.next();
+            if (nextContext.joinType().strength > context.joinType().strength) 
{
+                context = nextContext;
+            }
+
+            if (context.joinType().strength == 
JoiningRelationType.PK_ON_PK.strength) {
+                break;
+            }
+        }
+
+        if (context.joinType() == JoiningRelationType.UNKNOWN) {
+            // Fall-back to calcite's implementation.
+            return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition());
+        }
+
+        double postFiltrationAdjustment = joinContexts.size() == 1 && 
joinInfo.isEqui() ? 1.0
+                // Extra join keys as well as non-equi conditions serves azs 
post-filtration,
+                // therefore we need to adjust final result with a little 
factor.
+                : 0.7;
+
+        double baseRowCount;
+        Double percentageAdjustment;
+        if (context.joinType() == JoiningRelationType.PK_ON_PK) {
+            // Assume we have two fact tables SALES and RETURNS sharing the 
same primary key. Every item
+            // can be sold, but only items which were sold can be returned 
back, therefore
+            // size(SALES) > size(RETURNS). When joining SALES on RETURNS by 
primary key, the estimated
+            // result size will be the same as the size of the smallest table 
(RETURNS in this case),
+            // adjusted by the percentage of rows of the biggest table (SALES 
in this case; percentage
+            // adjustment is required to account for predicates pushed down to 
the table, e.g. we are
+            // interested in returns of items with certain category)
+            if (leftRowCount > rightRowCount) {
+                baseRowCount = rightRowCount;
+                percentageAdjustment = 
mq.getPercentageOriginalRows(rel.getLeft());
+            } else {
+                baseRowCount = leftRowCount;
+                percentageAdjustment = 
mq.getPercentageOriginalRows(rel.getRight());
+            }
+        } else {
+            // For foreign key joins the base table is the one which is joined 
by non-primary key columns.
+            if (context.joinType() == JoiningRelationType.FK_ON_PK) {
+                baseRowCount = leftRowCount;
+                percentageAdjustment = 
mq.getPercentageOriginalRows(rel.getRight());
+            } else {
+                assert context.joinType() == JoiningRelationType.PK_ON_FK : 
context.joinType();
+                baseRowCount = rightRowCount;
+                percentageAdjustment = 
mq.getPercentageOriginalRows(rel.getLeft());
+            }
+        }
+
+        if (percentageAdjustment == null) {
+            percentageAdjustment = 1.0; // No info, let's be conservative
+        }
+
+        return baseRowCount * percentageAdjustment * postFiltrationAdjustment;
+    }
+
+    private static Int2ObjectMap<KeyColumnOrigin> 
resolveOrigins(RelMetadataQuery mq, RelNode joinShoulder, ImmutableIntList 
keys) {
+        Int2ObjectMap<KeyColumnOrigin> origins = new Int2ObjectOpenHashMap<>();
+        for (int i : keys) {
+            if (origins.containsKey(i)) {
+                continue;
+            }
+
+            RelColumnOrigin origin = mq.getColumnOrigin(joinShoulder, i);
+            if (origin == null) {
+                continue;
+            }
+
+            IgniteTable table = 
origin.getOriginTable().unwrap(IgniteTable.class);
+            if (table == null) {
+                continue;
+            }
+
+            int positionInKey = 
table.keyColumns().indexOf(origin.getOriginColumnOrdinal());
+            origins.put(i, new KeyColumnOrigin(origin, positionInKey));
+        }
+
+        return origins;
+    }
+
+    private static class KeyColumnOrigin {
+        private final RelColumnOrigin origin;
+        private final int positionInKey;
+
+        KeyColumnOrigin(RelColumnOrigin origin, int positionInKey) {
+            this.origin = origin;
+            this.positionInKey = positionInKey;
+        }
+    }
+
+    private static class TablesPair {
+        private final RelOptTable left;
+        private final RelOptTable right;
+
+        TablesPair(RelOptTable left, RelOptTable right) {
+            this.left = left;
+            this.right = right;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            TablesPair that = (TablesPair) o;
+            // Reference equality on purpose.
+            return left == that.left && right == that.right;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(left, right);
+        }
+    }
+
+    private static class JoinContext {
+        /** Used columns of primary key of table from left side. */
+        private final BitSet leftKeys;
+        /** Used columns of primary key of table from right side. */
+        private final BitSet rightKeys;
+
+        /**
+         * Used columns of primary key in both tables.
+         *
+         * <p>This bitset is initialized when PK of both tables has equal 
columns count, and
+         * bits are cleared when join pair contains columns with equal 
positions in PK of corresponding
+         * table. For example, having tables T1 and T2 with primary keys in 
both tables defined as
+         * CONSTRAINT PRIMARY KEY (a, b), in case of query
+         * {@code SELECT ... FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b = t2.b} 
commonKeys will be initialized
+         * and cleared, but in case of query {@code SELECT ... FROM t1 JOIN t2 
ON t1.a = t2.b AND t1.b = t2.a}
+         * (mind the join condition, where column A of one table compared with 
column B of another), will be 
+         * only initialized (since size of the primary keys are equal), but 
not cleared.
+         */
+        private final @Nullable BitSet commonKeys;
+
+        JoinContext(int leftPkSize, int rightPkSize) {
+            this.leftKeys = new BitSet();
+            this.rightKeys = new BitSet();
+            this.commonKeys = leftPkSize == rightPkSize ? new BitSet() : null;
 
-        ImmutableIntList leftKeys = joinInfo.leftKeys;
-        ImmutableIntList rightKeys = joinInfo.rightKeys;
+            leftKeys.set(0, leftPkSize);
+            rightKeys.set(0, rightPkSize);
 
-        double selectivity = mq.getSelectivity(rel, rel.getCondition());
+            if (commonKeys != null) {
+                assert leftPkSize == rightPkSize;
 
-        if (nullOrEmpty(leftKeys) || nullOrEmpty(rightKeys)) {
-            return left * right * selectivity;
+                commonKeys.set(0, leftPkSize);
+            }
         }
 
-        double leftDistinct = Util.first(
-                mq.getDistinctRowCount(rel.getLeft(), 
ImmutableBitSet.of(leftKeys), null), left);
-        double rightDistinct = Util.first(
-                mq.getDistinctRowCount(rel.getRight(), 
ImmutableBitSet.of(rightKeys), null), right);
+        void countKeys(KeyColumnOrigin left, KeyColumnOrigin right) {
+            if (left.positionInKey >= 0) {
+                leftKeys.clear(left.positionInKey);
+            }
 
-        double leftCardinality = leftDistinct / left;
-        double rightCardinality = rightDistinct / right;
+            if (right.positionInKey >= 0) {
+                rightKeys.clear(right.positionInKey);
+            }
 
-        double rowsCount = (Math.min(left, right) / (leftCardinality * 
rightCardinality)) * selectivity;
+            if (commonKeys != null && left.positionInKey == 
right.positionInKey && left.positionInKey >= 0) {
+                commonKeys.clear(left.positionInKey);
+            }
+        }
 
-        JoinRelType type = rel.getJoinType();
+        JoiningRelationType joinType() {
+            if (commonKeys != null && commonKeys.isEmpty()) {
+                return JoiningRelationType.PK_ON_PK;
+            }
+
+            if (rightKeys.isEmpty()) {
+                return JoiningRelationType.FK_ON_PK;
+            }
+
+            if (leftKeys.isEmpty()) {
+                return JoiningRelationType.PK_ON_FK;
+            }
 
-        if (type == JoinRelType.LEFT) {
-            rowsCount += left;
-        } else if (type == JoinRelType.RIGHT) {
-            rowsCount += right;
-        } else if (type == JoinRelType.FULL) {
-            rowsCount += left + right;
+            return JoiningRelationType.UNKNOWN;
         }
+    }
+
+    /** Enumeration of join types by their semantic. */
+    private enum JoiningRelationType {
+        /**
+         * Join by non-primary key columns.
+         *
+         * <p>Semantic is unknown.
+         */
+        UNKNOWN(0),
+        /**
+         * Join by primary keys on non-primary keys.
+         *
+         * <p>Currently we don't support Foreign Keys, thus will assume such 
types of joins

Review Comment:
   fixed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscr...@ignite.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to