yuxiqian commented on code in PR #4279: URL: https://github.com/apache/flink-cdc/pull/4279#discussion_r2909159685
########## flink-cdc-runtime/src/main/java/org/apache/flink/cdc/runtime/operators/transform/TransformSchemaChangeUtils.java: ########## @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.runtime.operators.transform; + +import org.apache.flink.cdc.common.event.AddColumnEvent; +import org.apache.flink.cdc.common.event.SchemaChangeEvent; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.utils.SchemaUtils; + +import org.slf4j.Logger; + +import java.util.Optional; + +/** + * Shared utility methods for filtering schema change events in both {@link PreTransformOperator} + * and {@link PostTransformOperator}. + */ +class TransformSchemaChangeUtils { + + /** + * Filters duplicate {@link AddColumnEvent} columns that already exist in the given schema. For + * non-AddColumnEvent schema changes, the event is returned as-is. 
+ * + * @param currentSchema the current schema to check against + * @param event the schema change event to filter + * @param log the logger to use for debug messages + * @return the filtered event, or {@link Optional#empty()} if the event is fully redundant + */ + static Optional<SchemaChangeEvent> filterDuplicateAddColumns( + Schema currentSchema, SchemaChangeEvent event, Logger log) { Review Comment: It's unusual to pass the `Logger` as an argument. ########## flink-cdc-runtime/src/test/java/org/apache/flink/cdc/runtime/operators/transform/TransformOperatorWithSchemaEvolveTest.java: ########## @@ -840,6 +840,262 @@ void testSchemaChangeWithPostWildcard() throws Exception { .runTests("inserting columns at last"); } + /** + * Tests duplicate AddColumnEvent handling with a wildcard projection ({@code *, id + age as + * computed}) and a filter ({@code name <> 'Alice'}). This exercises the full pre+post pipeline + * with a wildcard rule: the first AddColumnEvent for "extras" passes through both operators, + * while the duplicate is filtered by PreTransformOperator (which also prevents it from reaching + * PostTransformOperator). A subsequent DataChangeEvent verifies the pipeline remains functional + * after duplicate filtering, with the computed column correctly evaluated. + */ + @Test + void testDuplicateAddColumnEventPreTransform() throws Exception { Review Comment: Please also add an ITCase or E2e test in `flink-cdc-composer`. ########## flink-cdc-runtime/src/main/java/org/apache/flink/cdc/runtime/operators/transform/TransformSchemaChangeUtils.java: ########## @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.runtime.operators.transform; + +import org.apache.flink.cdc.common.event.AddColumnEvent; +import org.apache.flink.cdc.common.event.SchemaChangeEvent; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.utils.SchemaUtils; + +import org.slf4j.Logger; + +import java.util.Optional; + +/** + * Shared utility methods for filtering schema change events in both {@link PreTransformOperator} + * and {@link PostTransformOperator}. + */ +class TransformSchemaChangeUtils { Review Comment: Is it worth creating a new Utils class just for this one method? Putting it in `SchemaUtils` makes more sense. 
########## flink-cdc-common/src/main/java/org/apache/flink/cdc/common/utils/SchemaUtils.java: ########## @@ -120,7 +126,35 @@ public static Schema applySchemaChangeEvent(Schema schema, SchemaChangeEvent eve private static Schema applyAddColumnEvent(AddColumnEvent event, Schema oldSchema) { LinkedList<Column> columns = new LinkedList<>(oldSchema.getColumns()); + Set<String> existingColumnNames = + columns.stream() + .map(Column::getName) + .collect(Collectors.toCollection(HashSet::new)); for (AddColumnEvent.ColumnWithPosition columnWithPosition : event.getAddedColumns()) { + // Skip columns that already exist in the schema to handle duplicate AddColumnEvents + // (e.g., from gh-ost online schema migrations) + if (existingColumnNames.contains(columnWithPosition.getAddColumn().getName())) { + Column incomingColumn = columnWithPosition.getAddColumn(); + columns.stream() + .filter(c -> c.getName().equals(incomingColumn.getName())) + .findFirst() + .ifPresent( + existingColumn -> { + if (!existingColumn + .getType() + .equals(incomingColumn.getType())) { Review Comment: What will happen if the existing column definition and the incoming definition are incompatible? Will there be any coercion or implicit casting? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
