lowka commented on code in PR #4992: URL: https://github.com/apache/ignite-3/pull/4992#discussion_r1903669268
########## modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/metadata/IgniteMdRowCount.java: ########## @@ -100,68 +106,334 @@ public double getRowCount(IgniteLimit rel, RelMetadataQuery mq) { } /** - * JoinRowCount. - * TODO Documentation https://issues.apache.org/jira/browse/IGNITE-15859 + * Estimates the number of rows produced by a join operation. + * + * <p>This method calculates an estimated row count for a join by analyzing the join type, + * join keys, and the cardinality of the left and right inputs. It provides specialized + * handling for primary key and foreign key relationships. When certain metadata is unavailable + * or when specific conditions are not met, it falls back to Calcite's default implementation + * for estimating the row count. + * + * <p>Implementation details:</p> + * <ul> + * <li>If the join type is not {@link JoinRelType#INNER}, Calcite's default implementation is used.</li> + * <li>If the join is non-equi join, Calcite's default implementation is used.</li> + * <li>The row counts of the left and right inputs are retrieved using + * {@link RelMetadataQuery#getRowCount}. If either value is unavailable, the result is {@code null}.</li> + * <li>If the row counts are very small (≤ 1.0), the method uses the maximum row count as a fallback.</li> + * <li>Join key origins are resolved for the left and right inputs, and relationships between tables + * (e.g., primary key or foreign key associations) are identified and grouped into join contexts.</li> + * <li>If no valid join context is found, the method falls back to Calcite's implementation.</li> + * <li>The base row count is determined by the type of join relationship: + * <ul> + * <li>For primary key-to-primary key joins, the row count is based on the smaller table, + * adjusted by a percentage of the larger table's rows.</li> + * <li>For foreign key joins, the base table is determined based on which table is + * joined using non-primary key columns.</li> + * </ul> + * </li> + * <li>An additional adjustment factor is applied for post-filtration conditions, such as extra join keys + * or non-equi conditions.</li> + * <li>If metadata for the percentage of original rows is unavailable, the adjustment defaults to 1.0.</li> + * </ul> + * + * <p>If none of the above criteria are satisfied, the method defaults to + * {@link RelMdUtil#getJoinRowCount} for the estimation.</p> + * + * @param mq The {@link RelMetadataQuery} used to retrieve metadata about relational expressions. + * @param rel The {@link Join} relational expression representing the join operation. + * @return The estimated number of rows resulting from the join, or {@code null} if the estimation cannot be determined. + * + * @see RelMetadataQuery#getRowCount + * @see RelMdUtil#getJoinRowCount */ - @Nullable - public static Double joinRowCount(RelMetadataQuery mq, Join rel) { - if (!rel.getJoinType().projectsRight()) { - // Create a RexNode representing the selectivity of the - // semijoin filter and pass it to getSelectivity - RexNode semiJoinSelectivity = - RelMdUtil.makeSemiJoinSelectivityRexNode(mq, rel); + public static @Nullable Double joinRowCount(RelMetadataQuery mq, Join rel) { + if (rel.getJoinType() != JoinRelType.INNER) { + // Fall-back to calcite's implementation. + return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition()); + } - return multiply(mq.getSelectivity(rel.getLeft(), semiJoinSelectivity), - mq.getRowCount(rel.getLeft())); + JoinInfo joinInfo = rel.analyzeCondition(); + + if (joinInfo.pairs().isEmpty()) { + // Fall-back to calcite's implementation. + return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition()); } - // Row count estimates of 0 will be rounded up to 1. - // So, use maxRowCount where the product is very small. - final Double left = mq.getRowCount(rel.getLeft()); - final Double right = mq.getRowCount(rel.getRight()); + Double leftRowCount = mq.getRowCount(rel.getLeft()); + Double rightRowCount = mq.getRowCount(rel.getRight()); - if (left == null || right == null) { + if (leftRowCount == null || rightRowCount == null) { return null; } - if (left <= 1D || right <= 1D) { + // Row count estimates of 0 will be rounded up to 1. + // So, use maxRowCount where the product is very small. + if (leftRowCount <= 1.0 || rightRowCount <= 1.0) { Double max = mq.getMaxRowCount(rel); - if (max != null && max <= 1D) { + if (max != null && max <= 1.0) { return max; } } - JoinInfo joinInfo = rel.analyzeCondition(); + Int2ObjectMap<KeyColumnOrigin> columnsFromLeft = resolveOrigins(mq, rel.getLeft(), joinInfo.leftKeys); + Int2ObjectMap<KeyColumnOrigin> columnsFromRight = resolveOrigins(mq, rel.getRight(), joinInfo.rightKeys); + + Map<TablesPair, JoinContext> joinContexts = new HashMap<>(); + for (IntPair joinKeys : joinInfo.pairs()) { + KeyColumnOrigin leftKey = columnsFromLeft.get(joinKeys.source); + KeyColumnOrigin rightKey = columnsFromRight.get(joinKeys.target); + + if (leftKey == null || rightKey == null) { + continue; + } + + joinContexts.computeIfAbsent( + new TablesPair( + leftKey.origin.getOriginTable(), + rightKey.origin.getOriginTable() + ), + key -> { + IgniteTable leftTable = key.left.unwrap(IgniteTable.class); + IgniteTable rightTable = key.right.unwrap(IgniteTable.class); + + assert leftTable != null && rightTable != null; + + int leftPkSize = leftTable.keyColumns().size(); + int rightPkSize = rightTable.keyColumns().size(); + + return new JoinContext(leftPkSize, rightPkSize); + } + ).countKeys(leftKey, rightKey); + } + + if (joinContexts.isEmpty()) { + // Fall-back to calcite's implementation. + return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition()); + } + + Iterator<JoinContext> it = joinContexts.values().iterator(); + JoinContext context = it.next(); + while (it.hasNext()) { + JoinContext nextContext = it.next(); + if (nextContext.joinType().strength > context.joinType().strength) { + context = nextContext; + } + + if (context.joinType().strength == JoiningRelationType.PK_ON_PK.strength) { + break; + } + } + + if (context.joinType() == JoiningRelationType.UNKNOWN) { + // Fall-back to calcite's implementation. + return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition()); + } + + double postFiltrationAdjustment = joinContexts.size() == 1 && joinInfo.isEqui() ? 1.0 + // Extra join keys as well as non-equi conditions serves azs post-filtration, Review Comment: > asz A typo ########## modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/metadata/IgniteMdRowCount.java: ########## @@ -100,68 +106,334 @@ public double getRowCount(IgniteLimit rel, RelMetadataQuery mq) { } /** - * JoinRowCount. - * TODO Documentation https://issues.apache.org/jira/browse/IGNITE-15859 + * Estimates the number of rows produced by a join operation. + * + * <p>This method calculates an estimated row count for a join by analyzing the join type, + * join keys, and the cardinality of the left and right inputs. It provides specialized + * handling for primary key and foreign key relationships. When certain metadata is unavailable + * or when specific conditions are not met, it falls back to Calcite's default implementation + * for estimating the row count. + * + * <p>Implementation details:</p> + * <ul> + * <li>If the join type is not {@link JoinRelType#INNER}, Calcite's default implementation is used.</li> + * <li>If the join is non-equi join, Calcite's default implementation is used.</li> + * <li>The row counts of the left and right inputs are retrieved using + * {@link RelMetadataQuery#getRowCount}. If either value is unavailable, the result is {@code null}.</li> + * <li>If the row counts are very small (≤ 1.0), the method uses the maximum row count as a fallback.</li> + * <li>Join key origins are resolved for the left and right inputs, and relationships between tables + * (e.g., primary key or foreign key associations) are identified and grouped into join contexts.</li> + * <li>If no valid join context is found, the method falls back to Calcite's implementation.</li> + * <li>The base row count is determined by the type of join relationship: + * <ul> + * <li>For primary key-to-primary key joins, the row count is based on the smaller table, + * adjusted by a percentage of the larger table's rows.</li> + * <li>For foreign key joins, the base table is determined based on which table is + * joined using non-primary key columns.</li> + * </ul> + * </li> + * <li>An additional adjustment factor is applied for post-filtration conditions, such as extra join keys + * or non-equi conditions.</li> + * <li>If metadata for the percentage of original rows is unavailable, the adjustment defaults to 1.0.</li> + * </ul> + * + * <p>If none of the above criteria are satisfied, the method defaults to + * {@link RelMdUtil#getJoinRowCount} for the estimation.</p> + * + * @param mq The {@link RelMetadataQuery} used to retrieve metadata about relational expressions. + * @param rel The {@link Join} relational expression representing the join operation. + * @return The estimated number of rows resulting from the join, or {@code null} if the estimation cannot be determined. + * + * @see RelMetadataQuery#getRowCount + * @see RelMdUtil#getJoinRowCount */ - @Nullable - public static Double joinRowCount(RelMetadataQuery mq, Join rel) { - if (!rel.getJoinType().projectsRight()) { - // Create a RexNode representing the selectivity of the - // semijoin filter and pass it to getSelectivity - RexNode semiJoinSelectivity = - RelMdUtil.makeSemiJoinSelectivityRexNode(mq, rel); + public static @Nullable Double joinRowCount(RelMetadataQuery mq, Join rel) { + if (rel.getJoinType() != JoinRelType.INNER) { + // Fall-back to calcite's implementation. + return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition()); + } - return multiply(mq.getSelectivity(rel.getLeft(), semiJoinSelectivity), - mq.getRowCount(rel.getLeft())); + JoinInfo joinInfo = rel.analyzeCondition(); + + if (joinInfo.pairs().isEmpty()) { + // Fall-back to calcite's implementation. + return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition()); } - // Row count estimates of 0 will be rounded up to 1. - // So, use maxRowCount where the product is very small. - final Double left = mq.getRowCount(rel.getLeft()); - final Double right = mq.getRowCount(rel.getRight()); + Double leftRowCount = mq.getRowCount(rel.getLeft()); + Double rightRowCount = mq.getRowCount(rel.getRight()); - if (left == null || right == null) { + if (leftRowCount == null || rightRowCount == null) { return null; } - if (left <= 1D || right <= 1D) { + // Row count estimates of 0 will be rounded up to 1. + // So, use maxRowCount where the product is very small. + if (leftRowCount <= 1.0 || rightRowCount <= 1.0) { Double max = mq.getMaxRowCount(rel); - if (max != null && max <= 1D) { + if (max != null && max <= 1.0) { return max; } } - JoinInfo joinInfo = rel.analyzeCondition(); + Int2ObjectMap<KeyColumnOrigin> columnsFromLeft = resolveOrigins(mq, rel.getLeft(), joinInfo.leftKeys); + Int2ObjectMap<KeyColumnOrigin> columnsFromRight = resolveOrigins(mq, rel.getRight(), joinInfo.rightKeys); + + Map<TablesPair, JoinContext> joinContexts = new HashMap<>(); + for (IntPair joinKeys : joinInfo.pairs()) { + KeyColumnOrigin leftKey = columnsFromLeft.get(joinKeys.source); + KeyColumnOrigin rightKey = columnsFromRight.get(joinKeys.target); + + if (leftKey == null || rightKey == null) { + continue; + } + + joinContexts.computeIfAbsent( + new TablesPair( + leftKey.origin.getOriginTable(), + rightKey.origin.getOriginTable() + ), + key -> { + IgniteTable leftTable = key.left.unwrap(IgniteTable.class); + IgniteTable rightTable = key.right.unwrap(IgniteTable.class); + + assert leftTable != null && rightTable != null; + + int leftPkSize = leftTable.keyColumns().size(); + int rightPkSize = rightTable.keyColumns().size(); + + return new JoinContext(leftPkSize, rightPkSize); + } + ).countKeys(leftKey, rightKey); + } + + if (joinContexts.isEmpty()) { + // Fall-back to calcite's implementation. + return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition()); + } + + Iterator<JoinContext> it = joinContexts.values().iterator(); + JoinContext context = it.next(); + while (it.hasNext()) { + JoinContext nextContext = it.next(); + if (nextContext.joinType().strength > context.joinType().strength) { + context = nextContext; + } + + if (context.joinType().strength == JoiningRelationType.PK_ON_PK.strength) { + break; + } + } + + if (context.joinType() == JoiningRelationType.UNKNOWN) { + // Fall-back to calcite's implementation. + return RelMdUtil.getJoinRowCount(mq, rel, rel.getCondition()); + } + + double postFiltrationAdjustment = joinContexts.size() == 1 && joinInfo.isEqui() ? 1.0 + // Extra join keys as well as non-equi conditions serves azs post-filtration, Review Comment: > asz A typo. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: notifications-unsubscr...@ignite.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org