hlteoh37 commented on code in PR #190:
URL: https://github.com/apache/flink-connector-aws/pull/190#discussion_r1973549566
##########
flink-connector-aws/flink-connector-dynamodb/src/main/java/org/apache/flink/connector/dynamodb/source/reader/PollingDynamoDbStreamsShardSplitReader.java:
##########
@@ -57,48 +59,101 @@ public class PollingDynamoDbStreamsShardSplitReader
             new DynamoDbStreamRecordsWithSplitIds(Collections.emptyIterator(), null, false);

     private final StreamProxy dynamodbStreams;
+    private final Duration getRecordsIdlePollingTimeBetweenNonEmptyPolls;
+    private final Duration getRecordsIdlePollingTimeBetweenEmptyPolls;

-    private final Deque<DynamoDbStreamsShardSplitState> assignedSplits = new ArrayDeque<>();
+    private final PriorityQueue<SplitContext> assignedSplits;
     private final Map<String, DynamoDbStreamsShardMetrics> shardMetricGroupMap;
     private final Set<String> pausedSplitIds = new HashSet<>();

     public PollingDynamoDbStreamsShardSplitReader(
             StreamProxy dynamodbStreamsProxy,
+            Duration getRecordsIdlePollingTimeBetweenNonEmptyPolls,
+            Duration getRecordsIdlePollingTimeBetweenEmptyPolls,
             Map<String, DynamoDbStreamsShardMetrics> shardMetricGroupMap) {
         this.dynamodbStreams = dynamodbStreamsProxy;
+        this.getRecordsIdlePollingTimeBetweenNonEmptyPolls =
+                getRecordsIdlePollingTimeBetweenNonEmptyPolls;
+        this.getRecordsIdlePollingTimeBetweenEmptyPolls =
+                getRecordsIdlePollingTimeBetweenEmptyPolls;
         this.shardMetricGroupMap = shardMetricGroupMap;
+        this.assignedSplits =
+                new PriorityQueue<>(
+                        (a, b) -> {
+                            // First, handle paused splits
+                            boolean aIsPaused = pausedSplitIds.contains(a.splitState.getSplitId());
+                            boolean bIsPaused = pausedSplitIds.contains(b.splitState.getSplitId());
+                            if (aIsPaused && !bIsPaused) {
+                                return 1;
+                            }
+                            if (!aIsPaused && bIsPaused) {
+                                return -1;
+                            }
+                            if (aIsPaused && bIsPaused) {
+                                return 0;
+                            }
+
+                            // Get next eligible time for both splits
+                            long aNextEligibleTime = getNextEligibleTime(a);
+                            long bNextEligibleTime = getNextEligibleTime(b);
+
+                            return Long.compare(aNextEligibleTime, bNextEligibleTime);
+                        });
+    }
+
+    private long getNextEligibleTime(SplitContext splitContext) {
+        long requiredDelay =
+                splitContext.wasLastPollEmpty
+                        ? getRecordsIdlePollingTimeBetweenEmptyPolls.toMillis()
+                        : getRecordsIdlePollingTimeBetweenNonEmptyPolls.toMillis();
+
+        return splitContext.lastPollTimeMillis + requiredDelay;
     }

     @Override
     public RecordsWithSplitIds<Record> fetch() throws IOException {
-        DynamoDbStreamsShardSplitState splitState = assignedSplits.poll();
-        if (splitState == null) {
+        if (assignedSplits.isEmpty()) {
             return INCOMPLETE_SHARD_EMPTY_RECORDS;
         }
+        SplitContext splitContext = assignedSplits.poll();
+
+        if (pausedSplitIds.contains(splitContext.splitState.getSplitId())) {
+            assignedSplits.add(splitContext);
+            return INCOMPLETE_SHARD_EMPTY_RECORDS;
+        }
+
+        // Check if split is paused or not ready due to empty poll delay
+        long currentTime = System.currentTimeMillis();
+        long nextEligibleTime = getNextEligibleTime(splitContext);

-        if (pausedSplitIds.contains(splitState.getSplitId())) {
-            assignedSplits.add(splitState);
+        if (nextEligibleTime > currentTime) {
+            assignedSplits.add(splitContext);
+            sleep(20);

Review Comment:
   We should not call `Thread.sleep()` in this method, as it will pause processing in the Flink runtime. Can we instead store the "next read timestamp" in the split record and add the split back to the `Deque`? The next time we call this, we can check whether the next read timestamp has arrived.
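   A minimal, self-contained sketch of the suggested pattern, assuming a simplified `SplitContext` that carries a `nextEligibleTimeMillis` field; the class name, field names, delay constants, and `pollShard` placeholder below are illustrative stand-ins, not the connector's actual types. The point is that `fetch()` re-queues a split that is not yet eligible and returns an empty result immediately, so the fetcher thread never blocks:

   import java.util.ArrayDeque;
   import java.util.Deque;

   public class NonBlockingPollSketch {

       /** Simplified stand-in for the split state, carrying the next read timestamp. */
       static final class SplitContext {
           final String splitId;
           long nextEligibleTimeMillis; // updated after every poll

           SplitContext(String splitId) {
               this.splitId = splitId;
           }
       }

       private static final long EMPTY_POLL_DELAY_MS = 250;
       private static final long NON_EMPTY_POLL_DELAY_MS = 0;

       private final Deque<SplitContext> assignedSplits = new ArrayDeque<>();

       /** Returns the split id that was polled, or null if nothing was eligible. */
       String fetchOnce() {
           SplitContext split = assignedSplits.poll();
           if (split == null) {
               return null;
           }

           long now = System.currentTimeMillis();
           if (now < split.nextEligibleTimeMillis) {
               // Not yet eligible: push the split back and return without blocking,
               // instead of calling Thread.sleep() on the fetcher thread.
               assignedSplits.add(split);
               return null;
           }

           boolean pollWasEmpty = pollShard(split);
           // Record when this split may be polled again, based on the poll outcome.
           split.nextEligibleTimeMillis =
                   now + (pollWasEmpty ? EMPTY_POLL_DELAY_MS : NON_EMPTY_POLL_DELAY_MS);
           assignedSplits.add(split);
           return split.splitId;
       }

       private boolean pollShard(SplitContext split) {
           // Placeholder for the real GetRecords call; pretend the shard was empty.
           return true;
       }

       public static void main(String[] args) {
           NonBlockingPollSketch reader = new NonBlockingPollSketch();
           reader.assignedSplits.add(new SplitContext("shardId-000000000000"));
           System.out.println(reader.fetchOnce()); // polls immediately
           System.out.println(reader.fetchOnce()); // null: still within the empty-poll delay
       }
   }

   The trade-off is that `fetch()` may spin through ineligible splits; returning a shared empty-records sentinel (like `INCOMPLETE_SHARD_EMPTY_RECORDS` in the diff) keeps each such pass cheap.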