gguptp commented on code in PR #190: URL: https://github.com/apache/flink-connector-aws/pull/190#discussion_r1973555290
########## flink-connector-aws/flink-connector-dynamodb/src/main/java/org/apache/flink/connector/dynamodb/source/reader/PollingDynamoDbStreamsShardSplitReader.java: ########## @@ -57,48 +59,101 @@ public class PollingDynamoDbStreamsShardSplitReader new DynamoDbStreamRecordsWithSplitIds(Collections.emptyIterator(), null, false); private final StreamProxy dynamodbStreams; + private final Duration getRecordsIdlePollingTimeBetweenNonEmptyPolls; + private final Duration getRecordsIdlePollingTimeBetweenEmptyPolls; - private final Deque<DynamoDbStreamsShardSplitState> assignedSplits = new ArrayDeque<>(); + private final PriorityQueue<SplitContext> assignedSplits; private final Map<String, DynamoDbStreamsShardMetrics> shardMetricGroupMap; private final Set<String> pausedSplitIds = new HashSet<>(); public PollingDynamoDbStreamsShardSplitReader( StreamProxy dynamodbStreamsProxy, + Duration getRecordsIdlePollingTimeBetweenNonEmptyPolls, + Duration getRecordsIdlePollingTimeBetweenEmptyPolls, Map<String, DynamoDbStreamsShardMetrics> shardMetricGroupMap) { this.dynamodbStreams = dynamodbStreamsProxy; + this.getRecordsIdlePollingTimeBetweenNonEmptyPolls = + getRecordsIdlePollingTimeBetweenNonEmptyPolls; + this.getRecordsIdlePollingTimeBetweenEmptyPolls = + getRecordsIdlePollingTimeBetweenEmptyPolls; this.shardMetricGroupMap = shardMetricGroupMap; + this.assignedSplits = + new PriorityQueue<>( + (a, b) -> { + // First, handle paused splits + boolean aIsPaused = pausedSplitIds.contains(a.splitState.getSplitId()); + boolean bIsPaused = pausedSplitIds.contains(b.splitState.getSplitId()); + if (aIsPaused && !bIsPaused) { + return 1; + } + if (!aIsPaused && bIsPaused) { + return -1; + } + if (aIsPaused && bIsPaused) { + return 0; + } + + // Get next eligible time for both splits + long aNextEligibleTime = getNextEligibleTime(a); + long bNextEligibleTime = getNextEligibleTime(b); + + return Long.compare(aNextEligibleTime, bNextEligibleTime); + }); + } + + private long 
getNextEligibleTime(SplitContext splitContext) { + long requiredDelay = + splitContext.wasLastPollEmpty + ? getRecordsIdlePollingTimeBetweenEmptyPolls.toMillis() + : getRecordsIdlePollingTimeBetweenNonEmptyPolls.toMillis(); + + return splitContext.lastPollTimeMillis + requiredDelay; } @Override public RecordsWithSplitIds<Record> fetch() throws IOException { - DynamoDbStreamsShardSplitState splitState = assignedSplits.poll(); - if (splitState == null) { + if (assignedSplits.isEmpty()) { return INCOMPLETE_SHARD_EMPTY_RECORDS; } + SplitContext splitContext = assignedSplits.poll(); + + if (pausedSplitIds.contains(splitContext.splitState.getSplitId())) { + assignedSplits.add(splitContext); + return INCOMPLETE_SHARD_EMPTY_RECORDS; + } + + // Check if split is paused or not ready due to empty poll delay + long currentTime = System.currentTimeMillis(); + long nextEligibleTime = getNextEligibleTime(splitContext); - if (pausedSplitIds.contains(splitState.getSplitId())) { - assignedSplits.add(splitState); + if (nextEligibleTime > currentTime) { + assignedSplits.add(splitContext); + sleep(20); Review Comment: Yeah, I was initially doing that, but without the thread sleep this leads to the job manager CPU running at 100%. Is there any good way we can provide to control the fetch calls? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org