mxm commented on code in PR #787: URL: https://github.com/apache/flink-kubernetes-operator/pull/787#discussion_r1509934269
########## flink-autoscaler/src/main/java/org/apache/flink/autoscaler/ScalingExecutor.java: ########## @@ -129,6 +131,11 @@ public boolean scaleResource( scalingSummaries, autoScalerEventHandler); + // Increase the biggest vertex's parallelism to the current taskmanager count's + // max-parallelism + updateBiggestVertexParallelism( Review Comment: This method needs to be called earlier because we already sent out the scaling event with the overrides. Having the overrides changed at this point would be a surprise to users. ########## flink-autoscaler/src/main/java/org/apache/flink/autoscaler/ScalingExecutor.java: ########## @@ -153,6 +160,52 @@ public boolean scaleResource( return true; } + private void updateBiggestVertexParallelism( + Context context, + Map<JobVertexID, Map<ScalingMetric, EvaluatedScalingMetric>> evaluatedMetrics, + Map<JobVertexID, ScalingSummary> scalingSummaries) { + int taskSlotsPerTm = context.getConfiguration().get(TaskManagerOptions.NUM_TASK_SLOTS); + + Map<Integer, Set<JobVertexID>> parallelismVertexMap = new HashMap<>(); + int maxParallelism = 0; + + for (Map.Entry<JobVertexID, ScalingSummary> entry : scalingSummaries.entrySet()) { + JobVertexID vertex = entry.getKey(); + ScalingSummary summary = entry.getValue(); + int newParallelism = summary.getNewParallelism(); + + // Update maxParallelism if a new maximum is found + if (newParallelism > maxParallelism) { + maxParallelism = newParallelism; + } + + // Map newParallelism to JobVertexID + parallelismVertexMap.computeIfAbsent(newParallelism, k -> new HashSet<>()).add(vertex); + } + + // After the loop, retrieve the JobVertexIDs with the maxParallelism value + Set<JobVertexID> verticesWithMaxParallelism = + parallelismVertexMap.getOrDefault(maxParallelism, new HashSet<>()); + + // Compute the maximum parallelism for a given taskSlotsPerTm + var time = maxParallelism / taskSlotsPerTm; + if (maxParallelism > time * taskSlotsPerTm) { + maxParallelism = (time + 1) * taskSlotsPerTm; + } + + 
final var finalMaxNewParallelism = maxParallelism; + + scalingSummaries.forEach( + (vertexID, summary) -> { + if (verticesWithMaxParallelism.contains(vertexID)) { + // Here, you update the newParallelism for each matching JobVertexID + summary.setNewParallelism( + finalMaxNewParallelism); // Assuming setNewParallelism updates the + // value Review Comment: ```suggestion ``` ########## flink-autoscaler/src/main/java/org/apache/flink/autoscaler/ScalingExecutor.java: ########## @@ -153,6 +160,52 @@ public boolean scaleResource( return true; } + private void updateBiggestVertexParallelism( + Context context, + Map<JobVertexID, Map<ScalingMetric, EvaluatedScalingMetric>> evaluatedMetrics, + Map<JobVertexID, ScalingSummary> scalingSummaries) { + int taskSlotsPerTm = context.getConfiguration().get(TaskManagerOptions.NUM_TASK_SLOTS); + + Map<Integer, Set<JobVertexID>> parallelismVertexMap = new HashMap<>(); + int maxParallelism = 0; + + for (Map.Entry<JobVertexID, ScalingSummary> entry : scalingSummaries.entrySet()) { + JobVertexID vertex = entry.getKey(); + ScalingSummary summary = entry.getValue(); + int newParallelism = summary.getNewParallelism(); + + // Update maxParallelism if a new maximum is found + if (newParallelism > maxParallelism) { + maxParallelism = newParallelism; + } + + // Map newParallelism to JobVertexID + parallelismVertexMap.computeIfAbsent(newParallelism, k -> new HashSet<>()).add(vertex); + } + + // After the loop, retrieve the JobVertexIDs with the maxParallelism value + Set<JobVertexID> verticesWithMaxParallelism = + parallelismVertexMap.getOrDefault(maxParallelism, new HashSet<>()); + + // Compute the maximum parallelism for a given taskSlotsPerTm + var time = maxParallelism / taskSlotsPerTm; + if (maxParallelism > time * taskSlotsPerTm) { + maxParallelism = (time + 1) * taskSlotsPerTm; + } Review Comment: This assumes that slot sharing is enabled and there is a single slot sharing group. 
If slot sharing is (partially) disabled, the computation logic is different. I think we only want to perform this type of adjustment when `num_task_slots_used == maxParallelism`. We have already added `TASKS_SLOTS_USED` metric to the global metrics, which you can pass into this method. ########## flink-autoscaler/src/main/java/org/apache/flink/autoscaler/ScalingExecutor.java: ########## @@ -129,6 +131,11 @@ public boolean scaleResource( scalingSummaries, autoScalerEventHandler); + // Increase the biggest vertex's parallelism to the current taskmanager count's + // max-parallelism + updateBiggestVertexParallelism( Review Comment: There's also a parallelism adjustment made to spread out Flink's KeyGroups evenly. The adjustment in this PR could interfere with this logic and lead to uneven spread of state / keys. ########## flink-autoscaler/src/main/java/org/apache/flink/autoscaler/ScalingExecutor.java: ########## @@ -153,6 +160,52 @@ public boolean scaleResource( return true; } + private void updateBiggestVertexParallelism( + Context context, + Map<JobVertexID, Map<ScalingMetric, EvaluatedScalingMetric>> evaluatedMetrics, + Map<JobVertexID, ScalingSummary> scalingSummaries) { + int taskSlotsPerTm = context.getConfiguration().get(TaskManagerOptions.NUM_TASK_SLOTS); + + Map<Integer, Set<JobVertexID>> parallelismVertexMap = new HashMap<>(); + int maxParallelism = 0; + + for (Map.Entry<JobVertexID, ScalingSummary> entry : scalingSummaries.entrySet()) { + JobVertexID vertex = entry.getKey(); + ScalingSummary summary = entry.getValue(); + int newParallelism = summary.getNewParallelism(); + + // Update maxParallelism if a new maximum is found + if (newParallelism > maxParallelism) { + maxParallelism = newParallelism; + } + + // Map newParallelism to JobVertexID + parallelismVertexMap.computeIfAbsent(newParallelism, k -> new HashSet<>()).add(vertex); + } + + // After the loop, retrieve the JobVertexIDs with the maxParallelism value + Set<JobVertexID> verticesWithMaxParallelism = 
+ parallelismVertexMap.getOrDefault(maxParallelism, new HashSet<>()); + + // Compute the maximum parallelism for a given taskSlotsPerTm + var time = maxParallelism / taskSlotsPerTm; + if (maxParallelism > time * taskSlotsPerTm) { + maxParallelism = (time + 1) * taskSlotsPerTm; + } + + final var finalMaxNewParallelism = maxParallelism; + + scalingSummaries.forEach( + (vertexID, summary) -> { + if (verticesWithMaxParallelism.contains(vertexID)) { + // Here, you update the newParallelism for each matching JobVertexID + summary.setNewParallelism( + finalMaxNewParallelism); // Assuming setNewParallelism updates the Review Comment: I wonder, why only consider the vertex with the maximum parallelism? Why not +1 all the vertices? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org