apoorvmittal10 commented on code in PR #19261: URL: https://github.com/apache/kafka/pull/19261#discussion_r2022314413
########## core/src/main/java/kafka/server/share/SharePartition.java: ########## @@ -2484,6 +2504,205 @@ private long startOffsetDuringInitialization(long partitionDataStartOffset) thro } } + private ShareAcquiredRecords maybeFilterAbortedTransactionalAcquiredRecords( + FetchPartitionData fetchPartitionData, + FetchIsolation isolationLevel, + ShareAcquiredRecords shareAcquiredRecords + ) { + if (isolationLevel != FetchIsolation.TXN_COMMITTED || fetchPartitionData.abortedTransactions.isEmpty() || fetchPartitionData.abortedTransactions.get().isEmpty()) + return shareAcquiredRecords; + // When FetchIsolation.TXN_COMMITTED is used as isolation level by the share group, we need to filter any + // transactions that were aborted/did not commit due to timeout. + List<AcquiredRecords> result = filterAbortedTransactionalAcquiredRecords(fetchPartitionData.records.batches(), + shareAcquiredRecords.acquiredRecords(), fetchPartitionData.abortedTransactions.get()); + int acquiredCount = 0; + for (AcquiredRecords records : result) { + acquiredCount += (int) (records.lastOffset() - records.firstOffset() + 1); + } + return new ShareAcquiredRecords(result, acquiredCount); + } + + private List<AcquiredRecords> filterAbortedTransactionalAcquiredRecords( + Iterable<? extends RecordBatch> batches, + List<AcquiredRecords> acquiredRecords, + List<FetchResponseData.AbortedTransaction> abortedTransactions + ) { + lock.writeLock().lock(); + try { + // The record batches that need to be archived in cachedState because they were a part of aborted transactions. + List<RecordBatch> recordsToArchive = fetchAbortedTransactionRecordBatches(batches, abortedTransactions); + for (RecordBatch recordBatch : recordsToArchive) { + // Archive the offsets/batches in the cached state. + NavigableMap<Long, InFlightBatch> subMap = fetchSubMapOrException(recordBatch); + archiveAcquiredBatchRecords(subMap, recordBatch); + } + return filterRecordBatchesFromAcquiredRecords(acquiredRecords, recordsToArchive); + } finally { + lock.writeLock().unlock(); + } + } + + // Visible for testing. + List<AcquiredRecords> filterRecordBatchesFromAcquiredRecords( + List<AcquiredRecords> acquiredRecords, + List<RecordBatch> recordsToArchive + ) { + lock.writeLock().lock(); + try { + List<AcquiredRecords> result = new ArrayList<>(); + + for (AcquiredRecords acquiredRecord : acquiredRecords) { + List<AcquiredRecords> tempAcquiredRecords = new ArrayList<>(); + tempAcquiredRecords.add(acquiredRecord); + for (RecordBatch recordBatch : recordsToArchive) { + List<AcquiredRecords> newAcquiredRecords = new ArrayList<>(); + for (AcquiredRecords temp : tempAcquiredRecords) { + // Check if record batch overlaps with the acquired records. + if (temp.firstOffset() <= recordBatch.lastOffset() && temp.lastOffset() >= recordBatch.baseOffset()) { + // Split the acquired record into parts before, inside, and after the overlapping record batch. 
+ if (temp.firstOffset() < recordBatch.baseOffset()) { + newAcquiredRecords.add(new AcquiredRecords() + .setFirstOffset(temp.firstOffset()) + .setLastOffset(recordBatch.baseOffset() - 1) + .setDeliveryCount((short) 1)); + } + if (temp.lastOffset() > recordBatch.lastOffset()) { + newAcquiredRecords.add(new AcquiredRecords() + .setFirstOffset(recordBatch.lastOffset() + 1) + .setLastOffset(temp.lastOffset()) + .setDeliveryCount((short) 1)); + } + } else { + newAcquiredRecords.add(temp); + } + } + tempAcquiredRecords = newAcquiredRecords; + } + result.addAll(tempAcquiredRecords); + } + return result; + } finally { + lock.writeLock().unlock(); + } + } + + private void archiveAcquiredBatchRecords(NavigableMap<Long, InFlightBatch> subMap, RecordBatch recordBatch) { + lock.writeLock().lock(); + try { + // The fetched batch either is exact fetch equivalent batch (mostly), subset + // or spans over multiple fetched batches. The state can vary per offset itself from + // the fetched batch in case of subset. + for (Map.Entry<Long, InFlightBatch> entry : subMap.entrySet()) { + InFlightBatch inFlightBatch = entry.getValue(); + + // If startOffset has moved ahead of the in-flight batch, skip the batch. + if (inFlightBatch.lastOffset() < startOffset) { + log.trace("All offsets in the inflight batch {} are already archived: {}-{}", + inFlightBatch, groupId, topicIdPartition); + continue; + } + + // Determine if the in-flight batch is a full match from the request batch. + boolean fullMatch = checkForFullMatch(inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset()); + + // Maintain state per offset if the inflight batch is not a full match or the + // offset state is managed for this in-flight batch. + if (!fullMatch || inFlightBatch.offsetState() != null) { + log.debug("Subset or offset tracked batch record found for record," + + " batch: {}, request offsets - first: {}, last: {} for the share partition: {}-{}", + inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset(), groupId, topicIdPartition); + if (inFlightBatch.offsetState() == null) { + // The record batch is a subset and requires per offset state hence initialize + // the offsets state in the in-flight batch. + inFlightBatch.maybeInitializeOffsetStateUpdate(); + } + archivePerOffsetBatchRecords(inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset(), RecordState.ACQUIRED); + continue; + } + // The in-flight batch is a full match hence change the state of the complete batch. + archiveCompleteBatch(inFlightBatch, RecordState.ACQUIRED); + } + } finally { + lock.writeLock().unlock(); + } + } + + /** + * This function fetches the sub map from cachedState where all the offset details present in the recordBatch can be referred to + * OR it gives an exception if those offsets are not present in cachedState. + * @param recordBatch The record batch for which we want to find the sub map. + * @return the sub map containing all the offset details. + */ + private NavigableMap<Long, InFlightBatch> fetchSubMapOrException(RecordBatch recordBatch) { + lock.readLock().lock(); + try { + Map.Entry<Long, InFlightBatch> floorEntry = cachedState.floorEntry(recordBatch.baseOffset()); + if (floorEntry == null) { + log.debug("Fetched batch record {} not found for share partition: {}-{}", recordBatch, groupId, + topicIdPartition); + throw new InvalidRecordStateException( + "Batch record not found. 
The request batch offsets are not found in the cache.");
+            }
+            return cachedState.subMap(floorEntry.getKey(), true, recordBatch.lastOffset(), true);
+        } finally {
+            lock.readLock().unlock();
+        }
+    }
+
+    // Visible for testing.
+    List<RecordBatch> fetchAbortedTransactionRecordBatches(
+        Iterable<? extends RecordBatch> batches,
+        List<FetchResponseData.AbortedTransaction> abortedTransactions
+    ) {
+        PriorityQueue<FetchResponseData.AbortedTransaction> orderedAbortedTransactions = orderedAbortedTransactions(abortedTransactions);
+        Set<Long> abortedProducerIds = new HashSet<>();
+        List<RecordBatch> recordsToArchive = new ArrayList<>();
+
+        for (RecordBatch currentBatch : batches) {
+            if (currentBatch.hasProducerId()) {
+                // remove from the aborted transactions queue, all aborted transactions which have begun before the
+                // current batch's last offset and add the associated producerIds to the aborted producer set.
+                while (!orderedAbortedTransactions.isEmpty() && orderedAbortedTransactions.peek().firstOffset() <= currentBatch.lastOffset()) {
+                    FetchResponseData.AbortedTransaction abortedTransaction = orderedAbortedTransactions.poll();
+                    abortedProducerIds.add(abortedTransaction.producerId());
+                }
+                long producerId = currentBatch.producerId();
+                if (containsAbortMarker(currentBatch)) {
+                    abortedProducerIds.remove(producerId);
+                } else if (isBatchAborted(currentBatch, abortedProducerIds)) {
+                    log.debug("Skipping aborted record batch for share partition: {}-{} with producerId {} and " +
+                        "offsets {} to {}", groupId, topicIdPartition, producerId, currentBatch.baseOffset(), currentBatch.lastOffset());
+                    recordsToArchive.add(currentBatch);

Review Comment:
   The code is not covered in tests, can we please add scenarios which test the code.

########## core/src/main/scala/kafka/server/KafkaApis.scala: ##########
@@ -3239,7 +3239,7 @@ class KafkaApis(val requestChannel: RequestChannel,
       shareFetchRequest.maxWait,
       fetchMinBytes,
       fetchMaxBytes,
-      FetchIsolation.HIGH_WATERMARK,
+      FetchIsolation.of(FetchRequest.CONSUMER_REPLICA_ID, GroupConfig.defaultShareIsolationLevel),

Review Comment:
   Just FYI: there was a need to have `groupConfigManager` in KafkaApis as the final response is created in KafkaApis, hence I added it in the following PR: https://github.com/apache/kafka/pull/19334/files. Once merged, we can move the code back here from SharePartitionManager. Sorry for the trouble.

########## core/src/main/java/kafka/server/share/SharePartition.java: ##########
@@ -2484,6 +2497,241 @@ private long startOffsetDuringInitialization(long partitionDataStartOffset) thro
         }
     }

+    private ShareAcquiredRecords maybeFilterAbortedTransactionalAcquiredRecords(
+        FetchPartitionData fetchPartitionData,
+        FetchIsolation isolationLevel,
+        ShareAcquiredRecords shareAcquiredRecords
+    ) {
+        if (isolationLevel != FetchIsolation.TXN_COMMITTED || fetchPartitionData.abortedTransactions.isEmpty() || fetchPartitionData.abortedTransactions.get().isEmpty())
+            return shareAcquiredRecords;
+        // When FetchIsolation.TXN_COMMITTED is used as isolation level by the share group, we need to filter any
+        // transactions that were aborted/did not commit due to timeout.
+        List<AcquiredRecords> result = filterAbortedTransactionalAcquiredRecords(fetchPartitionData.records.batches(), shareAcquiredRecords.acquiredRecords(), fetchPartitionData.abortedTransactions.get());
+        int acquiredCount = 0;
+        for (AcquiredRecords records : result) {
+            acquiredCount += (int) (records.lastOffset() - records.firstOffset() + 1);
+        }
+        return new ShareAcquiredRecords(result, acquiredCount);
+    }
+
+    private List<AcquiredRecords> filterAbortedTransactionalAcquiredRecords(
+        Iterable<? extends RecordBatch> batches,
+        List<AcquiredRecords> acquiredRecords,
+        List<FetchResponseData.AbortedTransaction> abortedTransactions
+    ) {
+        lock.writeLock().lock();
+        try {
+            // The record batches that need to be archived in cachedState because they were a part of aborted transactions.
+            List<RecordBatch> recordsToArchive = fetchAbortedTransactionRecordBatches(batches, abortedTransactions);
+            for (RecordBatch recordBatch : recordsToArchive) {
+                // Archive the offsets/batches in the cached state.
+                NavigableMap<Long, InFlightBatch> subMap = fetchSubMap(recordBatch);
+                archiveAcquiredBatchRecords(subMap, recordBatch);
+            }
+            return filterRecordBatchesFromAcquiredRecords(acquiredRecords, recordsToArchive);
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    // Visible for testing.
+    List<AcquiredRecords> filterRecordBatchesFromAcquiredRecords(
+        List<AcquiredRecords> acquiredRecords,
+        List<RecordBatch> recordsToArchive
+    ) {
+        lock.writeLock().lock();
+        try {
+            List<AcquiredRecords> result = new ArrayList<>();
+
+            for (AcquiredRecords acquiredRecord : acquiredRecords) {
+                List<AcquiredRecords> tempAcquiredRecords = new ArrayList<>();
+                tempAcquiredRecords.add(acquiredRecord);
+                for (RecordBatch recordBatch : recordsToArchive) {
+                    List<AcquiredRecords> newAcquiredRecords = new ArrayList<>();
+                    for (AcquiredRecords temp : tempAcquiredRecords) {
+                        // Check if record batch overlaps with the acquired records.
+                        if (temp.firstOffset() <= recordBatch.lastOffset() && temp.lastOffset() >= recordBatch.baseOffset()) {
+                            // Split the acquired record into parts before, inside, and after the overlapping record batch.
+                            if (temp.firstOffset() < recordBatch.baseOffset()) {
+                                newAcquiredRecords.add(new AcquiredRecords()
+                                    .setFirstOffset(temp.firstOffset())
+                                    .setLastOffset(recordBatch.baseOffset() - 1)
+                                    .setDeliveryCount((short) 1));
+                            }
+                            if (temp.lastOffset() > recordBatch.lastOffset()) {
+                                newAcquiredRecords.add(new AcquiredRecords()
+                                    .setFirstOffset(recordBatch.lastOffset() + 1)
+                                    .setLastOffset(temp.lastOffset())
+                                    .setDeliveryCount((short) 1));
+                            }
+                        } else {
+                            newAcquiredRecords.add(temp);
+                        }
+                    }
+                    tempAcquiredRecords = newAcquiredRecords;
+                }
+                result.addAll(tempAcquiredRecords);
+            }
+            return result;
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private void archiveAcquiredBatchRecords(NavigableMap<Long, InFlightBatch> subMap, RecordBatch recordBatch) {

Review Comment:
   Return types can be ignored. I re-compared both and still think that we should merge. The comparison of startOffset and endOffset seems relevant in both methods and we should have them. Let me know if I am missing something.
   ```
   private void archiveAcquiredBatchRecords(NavigableMap<Long, InFlightBatch> subMap, RecordBatch recordBatch) {
       archiveAvailableRecords(recordBatch.firstOffset(), recordBatch.lastOffset() + 1, subMap, Acquired);
   ```
   ```
   private boolean archiveAvailableRecords(long startOffset, long endOffset, NavigableMap<Long, InFlightBatch> map, RecordState initialState)
   ```
   ```
   move archiveAvailableRecords => archiveRecords
   ```

########## core/src/main/java/kafka/server/share/SharePartition.java: ##########
@@ -2484,6 +2497,241 @@ private long startOffsetDuringInitialization(long partitionDataStartOffset) thro
         }
     }

+    private ShareAcquiredRecords maybeFilterAbortedTransactionalAcquiredRecords(
+        FetchPartitionData fetchPartitionData,
+        FetchIsolation isolationLevel,
+        ShareAcquiredRecords shareAcquiredRecords
+    ) {
+        if (isolationLevel != FetchIsolation.TXN_COMMITTED || fetchPartitionData.abortedTransactions.isEmpty() || fetchPartitionData.abortedTransactions.get().isEmpty())
+            return shareAcquiredRecords;
+        // When FetchIsolation.TXN_COMMITTED is used as isolation level by the share group, we need to filter any
+        // transactions that were aborted/did not commit due to timeout.
+        List<AcquiredRecords> result = filterAbortedTransactionalAcquiredRecords(fetchPartitionData.records.batches(), shareAcquiredRecords.acquiredRecords(), fetchPartitionData.abortedTransactions.get());
+        int acquiredCount = 0;
+        for (AcquiredRecords records : result) {
+            acquiredCount += (int) (records.lastOffset() - records.firstOffset() + 1);
+        }
+        return new ShareAcquiredRecords(result, acquiredCount);
+    }
+
+    private List<AcquiredRecords> filterAbortedTransactionalAcquiredRecords(
+        Iterable<? extends RecordBatch> batches,
+        List<AcquiredRecords> acquiredRecords,
+        List<FetchResponseData.AbortedTransaction> abortedTransactions
+    ) {
+        lock.writeLock().lock();
+        try {
+            // The record batches that need to be archived in cachedState because they were a part of aborted transactions.
+            List<RecordBatch> recordsToArchive = fetchAbortedTransactionRecordBatches(batches, abortedTransactions);
+            for (RecordBatch recordBatch : recordsToArchive) {
+                // Archive the offsets/batches in the cached state.
+                NavigableMap<Long, InFlightBatch> subMap = fetchSubMap(recordBatch);
+                archiveAcquiredBatchRecords(subMap, recordBatch);
+            }
+            return filterRecordBatchesFromAcquiredRecords(acquiredRecords, recordsToArchive);
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    // Visible for testing.
+    List<AcquiredRecords> filterRecordBatchesFromAcquiredRecords(
+        List<AcquiredRecords> acquiredRecords,
+        List<RecordBatch> recordsToArchive
+    ) {
+        lock.writeLock().lock();
+        try {
+            List<AcquiredRecords> result = new ArrayList<>();
+
+            for (AcquiredRecords acquiredRecord : acquiredRecords) {
+                List<AcquiredRecords> tempAcquiredRecords = new ArrayList<>();
+                tempAcquiredRecords.add(acquiredRecord);
+                for (RecordBatch recordBatch : recordsToArchive) {
+                    List<AcquiredRecords> newAcquiredRecords = new ArrayList<>();
+                    for (AcquiredRecords temp : tempAcquiredRecords) {
+                        // Check if record batch overlaps with the acquired records.
+                        if (temp.firstOffset() <= recordBatch.lastOffset() && temp.lastOffset() >= recordBatch.baseOffset()) {
+                            // Split the acquired record into parts before, inside, and after the overlapping record batch.
+ if (temp.firstOffset() < recordBatch.baseOffset()) { + newAcquiredRecords.add(new AcquiredRecords() + .setFirstOffset(temp.firstOffset()) + .setLastOffset(recordBatch.baseOffset() - 1) + .setDeliveryCount((short) 1)); + } + if (temp.lastOffset() > recordBatch.lastOffset()) { + newAcquiredRecords.add(new AcquiredRecords() + .setFirstOffset(recordBatch.lastOffset() + 1) + .setLastOffset(temp.lastOffset()) + .setDeliveryCount((short) 1)); + } + } else { + newAcquiredRecords.add(temp); + } + } + tempAcquiredRecords = newAcquiredRecords; + } + result.addAll(tempAcquiredRecords); + } + return result; + } finally { + lock.writeLock().unlock(); + } + } + + private void archiveAcquiredBatchRecords(NavigableMap<Long, InFlightBatch> subMap, RecordBatch recordBatch) { + lock.writeLock().lock(); + try { + // The fetched batch either is exact fetch equivalent batch (mostly), subset + // or spans over multiple fetched batches. The state can vary per offset itself from + // the fetched batch in case of subset. + for (Map.Entry<Long, InFlightBatch> entry : subMap.entrySet()) { + InFlightBatch inFlightBatch = entry.getValue(); + + // If startOffset has moved ahead of the in-flight batch, skip the batch. + if (inFlightBatch.lastOffset() < startOffset) { + log.trace("All offsets in the inflight batch {} are already archived: {}-{}", + inFlightBatch, groupId, topicIdPartition); + continue; + } + + // Determine if the in-flight batch is a full match from the request batch. + boolean fullMatch = checkForFullMatch(inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset()); + + // Maintain state per offset if the inflight batch is not a full match or the + // offset state is managed for this in-flight batch. + if (!fullMatch || inFlightBatch.offsetState() != null) { + log.debug("Subset or offset tracked batch record found for record," + + " batch: {}, request offsets - first: {}, last: {} for the share partition: {}-{}", + inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset(), groupId, topicIdPartition); + if (inFlightBatch.offsetState() == null) { + // The record batch is a subset and requires per offset state hence initialize + // the offsets state in the in-flight batch. + inFlightBatch.maybeInitializeOffsetStateUpdate(); + } + archivePerOffsetAcquiredBatchRecords(inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset()); + continue; + } + // The in-flight batch is a full match hence change the state of the complete batch. + archiveCompleteAcquiredBatch(inFlightBatch); + } + } finally { + lock.writeLock().unlock(); + } + } + + private void archivePerOffsetAcquiredBatchRecords(InFlightBatch inFlightBatch, long startOffsetToArchive, long endOffsetToArchive) { + lock.writeLock().lock(); + try { + log.trace("Archiving offset tracked batch: {} for the share partition: {}-{} since it was a part of aborted transaction", inFlightBatch, groupId, topicIdPartition); + for (Map.Entry<Long, InFlightState> offsetState : inFlightBatch.offsetState().entrySet()) { + if (offsetState.getKey() < startOffsetToArchive) { + continue; + } + if (offsetState.getKey() > endOffsetToArchive) { + // No further offsets to process. 
+                    break;
+                }
+                if (offsetState.getValue().state != RecordState.ACQUIRED) {
+                    continue;
+                }
+                offsetState.getValue().archive(EMPTY_MEMBER_ID);
+                offsetState.getValue().cancelAndClearAcquisitionLockTimeoutTask();
+            }
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private void archiveCompleteAcquiredBatch(InFlightBatch inFlightBatch) {
+        lock.writeLock().lock();
+        try {
+            log.trace("Archiving complete batch: {} for the share partition: {}-{} since it was a part of aborted transaction", inFlightBatch, groupId, topicIdPartition);
+            if (inFlightBatch.batchState() == RecordState.ACQUIRED) {
+                // Change the state of complete batch since the same state exists for the entire inFlight batch.
+                inFlightBatch.archiveBatch(EMPTY_MEMBER_ID);
+                inFlightBatch.batchState.cancelAndClearAcquisitionLockTimeoutTask();
+            }
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private NavigableMap<Long, InFlightBatch> fetchSubMap(RecordBatch recordBatch) {
+        lock.writeLock().lock();
+        try {
+            Map.Entry<Long, InFlightBatch> floorOffset = cachedState.floorEntry(recordBatch.baseOffset());
+            if (floorOffset == null) {
+                log.debug("Fetched batch record {} not found for share partition: {}-{}", recordBatch, groupId,
+                    topicIdPartition);
+                throw new InvalidRecordStateException(
+                    "Batch record not found. The request batch offsets are not found in the cache.");

Review Comment:
   `IllegalStateException` might be better as this condition should never be hit.

########## core/src/main/java/kafka/server/share/SharePartition.java: ##########
@@ -1241,7 +1254,8 @@ private boolean archiveAvailableRecords(long startOffset, long endOffset, Naviga
     private boolean archivePerOffsetBatchRecords(InFlightBatch inFlightBatch,
                                                  long startOffsetToArchive,
-                                                 long endOffsetToArchive) {
+                                                 long endOffsetToArchive,
+                                                 RecordState initialState) {

Review Comment:
   ```suggestion
                                                  RecordState initialState
     ) {
   ```

########## core/src/main/java/kafka/server/share/SharePartition.java: ##########
@@ -2484,6 +2504,205 @@ private long startOffsetDuringInitialization(long partitionDataStartOffset) thro
         }
     }

+    private ShareAcquiredRecords maybeFilterAbortedTransactionalAcquiredRecords(
+        FetchPartitionData fetchPartitionData,
+        FetchIsolation isolationLevel,
+        ShareAcquiredRecords shareAcquiredRecords
+    ) {
+        if (isolationLevel != FetchIsolation.TXN_COMMITTED || fetchPartitionData.abortedTransactions.isEmpty() || fetchPartitionData.abortedTransactions.get().isEmpty())
+            return shareAcquiredRecords;
+        // When FetchIsolation.TXN_COMMITTED is used as isolation level by the share group, we need to filter any
+        // transactions that were aborted/did not commit due to timeout.
+        List<AcquiredRecords> result = filterAbortedTransactionalAcquiredRecords(fetchPartitionData.records.batches(),
+            shareAcquiredRecords.acquiredRecords(), fetchPartitionData.abortedTransactions.get());
+        int acquiredCount = 0;
+        for (AcquiredRecords records : result) {
+            acquiredCount += (int) (records.lastOffset() - records.firstOffset() + 1);
+        }
+        return new ShareAcquiredRecords(result, acquiredCount);
+    }
+
+    private List<AcquiredRecords> filterAbortedTransactionalAcquiredRecords(
+        Iterable<? extends RecordBatch> batches,
+        List<AcquiredRecords> acquiredRecords,
+        List<FetchResponseData.AbortedTransaction> abortedTransactions
+    ) {
+        lock.writeLock().lock();
+        try {
+            // The record batches that need to be archived in cachedState because they were a part of aborted transactions.
+ List<RecordBatch> recordsToArchive = fetchAbortedTransactionRecordBatches(batches, abortedTransactions); + for (RecordBatch recordBatch : recordsToArchive) { + // Archive the offsets/batches in the cached state. + NavigableMap<Long, InFlightBatch> subMap = fetchSubMapOrException(recordBatch); + archiveAcquiredBatchRecords(subMap, recordBatch); + } + return filterRecordBatchesFromAcquiredRecords(acquiredRecords, recordsToArchive); + } finally { + lock.writeLock().unlock(); + } + } + + // Visible for testing. + List<AcquiredRecords> filterRecordBatchesFromAcquiredRecords( + List<AcquiredRecords> acquiredRecords, + List<RecordBatch> recordsToArchive + ) { + lock.writeLock().lock(); + try { + List<AcquiredRecords> result = new ArrayList<>(); + + for (AcquiredRecords acquiredRecord : acquiredRecords) { + List<AcquiredRecords> tempAcquiredRecords = new ArrayList<>(); + tempAcquiredRecords.add(acquiredRecord); + for (RecordBatch recordBatch : recordsToArchive) { + List<AcquiredRecords> newAcquiredRecords = new ArrayList<>(); + for (AcquiredRecords temp : tempAcquiredRecords) { + // Check if record batch overlaps with the acquired records. + if (temp.firstOffset() <= recordBatch.lastOffset() && temp.lastOffset() >= recordBatch.baseOffset()) { + // Split the acquired record into parts before, inside, and after the overlapping record batch. + if (temp.firstOffset() < recordBatch.baseOffset()) { + newAcquiredRecords.add(new AcquiredRecords() + .setFirstOffset(temp.firstOffset()) + .setLastOffset(recordBatch.baseOffset() - 1) + .setDeliveryCount((short) 1)); + } + if (temp.lastOffset() > recordBatch.lastOffset()) { + newAcquiredRecords.add(new AcquiredRecords() + .setFirstOffset(recordBatch.lastOffset() + 1) + .setLastOffset(temp.lastOffset()) + .setDeliveryCount((short) 1)); + } + } else { + newAcquiredRecords.add(temp); + } + } + tempAcquiredRecords = newAcquiredRecords; + } + result.addAll(tempAcquiredRecords); + } + return result; + } finally { + lock.writeLock().unlock(); + } + } + + private void archiveAcquiredBatchRecords(NavigableMap<Long, InFlightBatch> subMap, RecordBatch recordBatch) { + lock.writeLock().lock(); + try { + // The fetched batch either is exact fetch equivalent batch (mostly), subset + // or spans over multiple fetched batches. The state can vary per offset itself from + // the fetched batch in case of subset. + for (Map.Entry<Long, InFlightBatch> entry : subMap.entrySet()) { + InFlightBatch inFlightBatch = entry.getValue(); + + // If startOffset has moved ahead of the in-flight batch, skip the batch. + if (inFlightBatch.lastOffset() < startOffset) { + log.trace("All offsets in the inflight batch {} are already archived: {}-{}", + inFlightBatch, groupId, topicIdPartition); + continue; + } + + // Determine if the in-flight batch is a full match from the request batch. + boolean fullMatch = checkForFullMatch(inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset()); + + // Maintain state per offset if the inflight batch is not a full match or the + // offset state is managed for this in-flight batch. 
+ if (!fullMatch || inFlightBatch.offsetState() != null) { + log.debug("Subset or offset tracked batch record found for record," + + " batch: {}, request offsets - first: {}, last: {} for the share partition: {}-{}", + inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset(), groupId, topicIdPartition); + if (inFlightBatch.offsetState() == null) { + // The record batch is a subset and requires per offset state hence initialize + // the offsets state in the in-flight batch. + inFlightBatch.maybeInitializeOffsetStateUpdate(); + } + archivePerOffsetBatchRecords(inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset(), RecordState.ACQUIRED); + continue; + } + // The in-flight batch is a full match hence change the state of the complete batch. + archiveCompleteBatch(inFlightBatch, RecordState.ACQUIRED); + } + } finally { + lock.writeLock().unlock(); + } + } + + /** + * This function fetches the sub map from cachedState where all the offset details present in the recordBatch can be referred to + * OR it gives an exception if those offsets are not present in cachedState. + * @param recordBatch The record batch for which we want to find the sub map. + * @return the sub map containing all the offset details. + */ + private NavigableMap<Long, InFlightBatch> fetchSubMapOrException(RecordBatch recordBatch) { + lock.readLock().lock(); + try { + Map.Entry<Long, InFlightBatch> floorEntry = cachedState.floorEntry(recordBatch.baseOffset()); + if (floorEntry == null) { + log.debug("Fetched batch record {} not found for share partition: {}-{}", recordBatch, groupId, + topicIdPartition); + throw new InvalidRecordStateException( + "Batch record not found. The request batch offsets are not found in the cache."); + } + return cachedState.subMap(floorEntry.getKey(), true, recordBatch.lastOffset(), true); + } finally { + lock.readLock().unlock(); + } + } + + // Visible for testing. + List<RecordBatch> fetchAbortedTransactionRecordBatches( + Iterable<? extends RecordBatch> batches, + List<FetchResponseData.AbortedTransaction> abortedTransactions + ) { + PriorityQueue<FetchResponseData.AbortedTransaction> orderedAbortedTransactions = orderedAbortedTransactions(abortedTransactions); + Set<Long> abortedProducerIds = new HashSet<>(); + List<RecordBatch> recordsToArchive = new ArrayList<>(); + + for (RecordBatch currentBatch : batches) { + if (currentBatch.hasProducerId()) { + // remove from the aborted transactions queue, all aborted transactions which have begun before the + // current batch's last offset and add the associated producerIds to the aborted producer set. 
+                while (!orderedAbortedTransactions.isEmpty() && orderedAbortedTransactions.peek().firstOffset() <= currentBatch.lastOffset()) {
+                    FetchResponseData.AbortedTransaction abortedTransaction = orderedAbortedTransactions.poll();
+                    abortedProducerIds.add(abortedTransaction.producerId());
+                }
+                long producerId = currentBatch.producerId();
+                if (containsAbortMarker(currentBatch)) {
+                    abortedProducerIds.remove(producerId);
+                } else if (isBatchAborted(currentBatch, abortedProducerIds)) {
+                    log.debug("Skipping aborted record batch for share partition: {}-{} with producerId {} and " +
+                        "offsets {} to {}", groupId, topicIdPartition, producerId, currentBatch.baseOffset(), currentBatch.lastOffset());
+                    recordsToArchive.add(currentBatch);
+                }
+            }
+        }
+        return recordsToArchive;
+    }
+
+    private PriorityQueue<FetchResponseData.AbortedTransaction> orderedAbortedTransactions(List<FetchResponseData.AbortedTransaction> abortedTransactions) {
+        PriorityQueue<FetchResponseData.AbortedTransaction> orderedAbortedTransactions = new PriorityQueue<>(
+            abortedTransactions.size(), Comparator.comparingLong(FetchResponseData.AbortedTransaction::firstOffset)
+        );
+        orderedAbortedTransactions.addAll(abortedTransactions);
+        return orderedAbortedTransactions;
+    }
+
+    private boolean isBatchAborted(RecordBatch batch, Set<Long> abortedProducerIds) {
+        return batch.isTransactional() && abortedProducerIds.contains(batch.producerId());
+    }
+
+    private boolean containsAbortMarker(RecordBatch batch) {
+        if (!batch.isControlBatch())
+            return false;
+
+        Iterator<Record> batchIterator = batch.iterator();
+        if (!batchIterator.hasNext())
+            return false;
+
+        Record firstRecord = batchIterator.next();
+        return ControlRecordType.ABORT == ControlRecordType.parse(firstRecord.key());
+    }

Review Comment:
   Are there scenarios which test these methods?
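For illustration, a scenario along the following lines could exercise `fetchAbortedTransactionRecordBatches` and, through it, `orderedAbortedTransactions` and `isBatchAborted`. This is only a sketch: the `SharePartitionBuilder.builder().build()` helper and the mock-based setup are assumed from the existing `SharePartitionTest` conventions and may need adjusting.

```java
import java.util.List;

import org.apache.kafka.common.message.FetchResponseData;
import org.apache.kafka.common.record.RecordBatch;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import static org.junit.jupiter.api.Assertions.assertEquals;

// Sketch of a possible SharePartitionTest scenario. SharePartitionBuilder is assumed to be
// the existing builder helper in that test class; adjust to the actual setup.
@Test
public void testFetchAbortedTransactionRecordBatchesArchivesOnlyAbortedBatches() {
    SharePartition sharePartition = SharePartitionBuilder.builder().build();

    // Transactional batch (offsets 0-4, producerId 1) that belongs to an aborted transaction.
    RecordBatch abortedBatch = Mockito.mock(RecordBatch.class);
    Mockito.when(abortedBatch.hasProducerId()).thenReturn(true);
    Mockito.when(abortedBatch.isTransactional()).thenReturn(true);
    Mockito.when(abortedBatch.isControlBatch()).thenReturn(false);
    Mockito.when(abortedBatch.producerId()).thenReturn(1L);
    Mockito.when(abortedBatch.baseOffset()).thenReturn(0L);
    Mockito.when(abortedBatch.lastOffset()).thenReturn(4L);

    // Non-transactional batch that should not be considered for archival.
    RecordBatch plainBatch = Mockito.mock(RecordBatch.class);
    Mockito.when(plainBatch.hasProducerId()).thenReturn(false);

    // Aborted transaction metadata for producerId 1 starting at offset 0.
    List<FetchResponseData.AbortedTransaction> abortedTransactions = List.of(
        new FetchResponseData.AbortedTransaction().setProducerId(1L).setFirstOffset(0L));

    List<RecordBatch> recordsToArchive = sharePartition.fetchAbortedTransactionRecordBatches(
        List.of(abortedBatch, plainBatch), abortedTransactions);

    // Only the batch from the aborted transaction should be returned for archival.
    assertEquals(List.of(abortedBatch), recordsToArchive);
}
```

A companion scenario where a control batch's first record key parses to `ControlRecordType.ABORT` could cover `containsAbortMarker` as well.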