Hisoka-X commented on code in PR #8840:
URL: https://github.com/apache/seatunnel/pull/8840#discussion_r1976836955

##########
seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/compaction/IcebergCompactionHandler.java:
##########
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.iceberg.compaction;
+
+import org.apache.seatunnel.shade.com.google.common.collect.Iterables;
+import org.apache.seatunnel.shade.com.google.common.collect.Lists;
+
+import org.apache.seatunnel.api.source.Collector;
+import org.apache.seatunnel.api.table.type.SeaTunnelRow;
+import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader;
+import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.RecordWriter;
+import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.WriteResult;
+import org.apache.seatunnel.connectors.seatunnel.iceberg.source.split.IcebergFileScanTaskSplit;
+
+import org.apache.iceberg.CombinedScanTask;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.RewriteFiles;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.exceptions.CommitStateUnknownException;
+import org.apache.iceberg.io.CloseableIterator;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.util.Tasks;
+
+import lombok.extern.slf4j.Slf4j;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+@Slf4j
+public class IcebergCompactionHandler {
+
+    public static final String COMPACTION_ACTION_MESSAGE = "compaction_end";
+    public static final String CURRENT_SNAPSHOT_ID = "current_snapshot_id";
+    public static final String DELETED_FILES = "deleted_files";
+
+    private final Table table;
+    private final FileIO fileIO;
+    private final RecordWriter writer;
+
+    public IcebergCompactionHandler(IcebergTableLoader icebergTableLoader, RecordWriter writer) {
+        this.table = icebergTableLoader.loadTable();
+        this.fileIO = this.table.io();
+        this.writer = writer;
+    }
+
+    public static void emitCompactionEndRecord(
+            Collector<SeaTunnelRow> output, List<IcebergFileScanTaskSplit> finishedSplits) {
+        if (finishedSplits.isEmpty()) {
+            log.warn("No compaction split, skip it.");
+            return;
+        }
+        List<String> deletedFiles =
+                finishedSplits.stream()
+                        .map(split -> split.getTask().file().path().toString())
+                        .collect(Collectors.toList());
+
+        Optional<IcebergFileScanTaskSplit> firstSplit = finishedSplits.stream().findFirst();
+        if (!firstSplit.isPresent()) {
+            log.warn("Failed to find any splits, skipping compaction end record emission.");
+            return;
+        }
+        IcebergFileScanTaskSplit oneSplit = firstSplit.get();
+        SeaTunnelRow compactionEndRecord = new SeaTunnelRow(0);
+        compactionEndRecord.setTableId(oneSplit.getTablePath().getFullName());
+
+        Map<String, Object> options = new HashMap<>();
+        options.put(COMPACTION_ACTION_MESSAGE, COMPACTION_ACTION_MESSAGE);
+        options.put(CURRENT_SNAPSHOT_ID, oneSplit.getCurrentSnapshotId());
+        options.put(DELETED_FILES, deletedFiles);
+        compactionEndRecord.setOptions(options);
+        output.collect(compactionEndRecord);

Review Comment:
   So SeaTunnel can support file compaction only when both the source and the sink are Iceberg?
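   From the engine's point of view, the marker emitted above is an ordinary row; the compaction metadata travels in the row's options map rather than in its fields. As a minimal sketch of how a downstream Iceberg sink could recognize the marker, assuming `SeaTunnelRow` exposes a `getOptions()` accessor paired with the `setOptions(...)` call in the diff (the helper class below is illustrative, not part of the PR):

   ```java
   import java.util.Map;

   import org.apache.seatunnel.api.table.type.SeaTunnelRow;

   /** Illustrative helper; not part of the PR under review. */
   final class CompactionEndRecordCheck {

       // Same key/value pair the handler stores via
       // options.put(COMPACTION_ACTION_MESSAGE, COMPACTION_ACTION_MESSAGE).
       private static final String COMPACTION_ACTION_MESSAGE = "compaction_end";

       private CompactionEndRecordCheck() {}

       /** Returns true when the row is the compaction-end marker emitted by the handler. */
       static boolean isCompactionEnd(SeaTunnelRow row) {
           // Assumed accessor mirroring setOptions(...) in the diff above.
           Map<String, Object> options = row.getOptions();
           return options != null
                   && COMPACTION_ACTION_MESSAGE.equals(options.get(COMPACTION_ACTION_MESSAGE));
       }
   }
   ```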
##########
docs/en/connector-v2/source/Iceberg.md:
##########
@@ -71,25 +71,26 @@ libfb303-xxx.jar
 
 ## Source Options
 
-| Name | Type | Required | Default | Description |
-|------|------|----------|---------|-------------|
-| catalog_name | string | yes | - | User-specified catalog name. |
-| namespace | string | yes | - | The Iceberg database name in the backend catalog. |
-| table | string | no | - | The Iceberg table name in the backend catalog. |
-| table_list | string | no | - | The Iceberg table list in the backend catalog. |
-| iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog, which can be referenced in this file: "https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java" |
-| hadoop.config | map | no | - | Properties passed through to the Hadoop configuration. |
-| iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. |
-| schema | config | no | - | Use projection to select data columns and column order. |
-| case_sensitive | boolean | no | false | If data columns were selected via schema [config], controls whether the match to the schema is done with case sensitivity. |
-| start_snapshot_timestamp | long | no | - | Instructs this scan to look for changes starting from the most recent snapshot for the table as of the timestamp. <br/>timestamp – the timestamp in millis since the Unix epoch |
-| start_snapshot_id | long | no | - | Instructs this scan to look for changes starting from a particular snapshot (exclusive). |
-| end_snapshot_id | long | no | - | Instructs this scan to look for changes up to a particular snapshot (inclusive). |
-| use_snapshot_id | long | no | - | Instructs this scan to use the given snapshot ID. |
-| use_snapshot_timestamp | long | no | - | Instructs this scan to use the most recent snapshot as of the given time in milliseconds. <br/>timestamp – the timestamp in millis since the Unix epoch |
-| stream_scan_strategy | enum | no | FROM_LATEST_SNAPSHOT | Starting strategy for stream-mode execution; defaults to `FROM_LATEST_SNAPSHOT` if no value is specified. The optional values are:<br/>TABLE_SCAN_THEN_INCREMENTAL: Do a regular table scan, then switch to incremental mode.<br/>FROM_LATEST_SNAPSHOT: Start incremental mode from the latest snapshot, inclusive.<br/>FROM_EARLIEST_SNAPSHOT: Start incremental mode from the earliest snapshot, inclusive.<br/>FROM_SNAPSHOT_ID: Start incremental mode from a snapshot with a specific ID, inclusive.<br/>FROM_SNAPSHOT_TIMESTAMP: Start incremental mode from a snapshot with a specific timestamp, inclusive. |
-| increment.scan-interval | long | no | 2000 | The interval of incremental scans (millis). |
-| common-options | | no | - | Source plugin common parameters; please refer to [Source Common Options](../source-common-options.md) for details. |
+| Name | Type | Required | Default | Description |
+|------|------|----------|---------|-------------|
+| catalog_name | string | yes | - | User-specified catalog name. |
+| namespace | string | yes | - | The Iceberg database name in the backend catalog. |
+| table | string | no | - | The Iceberg table name in the backend catalog. |
+| table_list | string | no | - | The Iceberg table list in the backend catalog. |
+| iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog, which can be referenced in this file: "https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java" |
+| hadoop.config | map | no | - | Properties passed through to the Hadoop configuration. |
+| iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. |
+| schema | config | no | - | Use projection to select data columns and column order. |
+| case_sensitive | boolean | no | false | If data columns were selected via schema [config], controls whether the match to the schema is done with case sensitivity. |
+| start_snapshot_timestamp | long | no | - | Instructs this scan to look for changes starting from the most recent snapshot for the table as of the timestamp. <br/>timestamp – the timestamp in millis since the Unix epoch |
+| start_snapshot_id | long | no | - | Instructs this scan to look for changes starting from a particular snapshot (exclusive). |
+| end_snapshot_id | long | no | - | Instructs this scan to look for changes up to a particular snapshot (inclusive). |
+| use_snapshot_id | long | no | - | Instructs this scan to use the given snapshot ID. |
+| use_snapshot_timestamp | long | no | - | Instructs this scan to use the most recent snapshot as of the given time in milliseconds. <br/>timestamp – the timestamp in millis since the Unix epoch |
+| stream_scan_strategy | enum | no | FROM_LATEST_SNAPSHOT | Starting strategy for stream-mode execution; defaults to `FROM_LATEST_SNAPSHOT` if no value is specified. The optional values are:<br/>TABLE_SCAN_THEN_INCREMENTAL: Do a regular table scan, then switch to incremental mode.<br/>FROM_LATEST_SNAPSHOT: Start incremental mode from the latest snapshot, inclusive.<br/>FROM_EARLIEST_SNAPSHOT: Start incremental mode from the earliest snapshot, inclusive.<br/>FROM_SNAPSHOT_ID: Start incremental mode from a snapshot with a specific ID, inclusive.<br/>FROM_SNAPSHOT_TIMESTAMP: Start incremental mode from a snapshot with a specific timestamp, inclusive. |
+| increment.scan-interval | long | no | 2000 | The interval of incremental scans (millis). |
+| common-options | | no | - | Source plugin common parameters; please refer to [Source Common Options](../source-common-options.md) for details. |
+| compaction_action | boolean | no | false | Whether to enable the Compaction operation on the Iceberg table. Compaction is an optimization operation that merges small files into larger ones to improve read performance. When set to `true`, the Iceberg connector will perform Compaction during data writes. The default value is `false`, meaning Compaction is not enabled. |

Review Comment:
   Hi @sunxiaojian, why can we set `compaction_action` when reading from Iceberg? Would it perform file compaction even when reading from Iceberg?
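   For context on the question, a minimal sketch of how a boolean connector option with this name and default would typically be declared with SeaTunnel's `Options` builder; the class name and description text are assumptions for illustration, not the PR's actual code:

   ```java
   import org.apache.seatunnel.api.configuration.Option;
   import org.apache.seatunnel.api.configuration.Options;

   /** Illustrative only; the PR's real option class and wording may differ. */
   public class IcebergCompactionOptionSketch {

       // Mirrors the documented default: compaction stays off unless explicitly enabled.
       public static final Option<Boolean> COMPACTION_ACTION =
               Options.key("compaction_action")
                       .booleanType()
                       .defaultValue(false)
                       .withDescription(
                               "Whether to enable compaction (merging small files into"
                                       + " larger ones) on the Iceberg table.");
   }
   ```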
-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@seatunnel.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org