rovboyko commented on code in PR #26313: URL: https://github.com/apache/flink/pull/26313#discussion_r2115155616
########## flink-table/flink-table-runtime/src/main/java/org/apache/flink/table/runtime/operators/join/stream/state/MultiJoinStateViews.java: ########## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.table.runtime.operators.join.stream.state; + +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.api.common.state.MapState; +import org.apache.flink.api.common.state.MapStateDescriptor; +import org.apache.flink.api.common.state.StateTtlConfig; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.operators.join.stream.utils.JoinInputSideSpec; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.IterableIterator; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; + +import static 
org.apache.flink.table.runtime.util.StateConfigUtil.createTtlConfig; +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Factory class to create different implementations of {@link MultiJoinStateView} based on the + * characteristics described in {@link JoinInputSideSpec}. + * + * <p>Each state view uses a {@link MapState} where the primary key is the `mapKey` derived from the + * join conditions (via {@link + * org.apache.flink.table.runtime.operators.join.stream.keyselector.JoinKeyExtractor}). The value + * stored within this map depends on whether the input side has a unique key and how it relates to + * the join key, optimizing storage and access patterns. + */ +public final class MultiJoinStateViews { + + /** Creates a {@link MultiJoinStateView} depends on {@link JoinInputSideSpec}. */ + public static MultiJoinStateView create( + RuntimeContext ctx, + String stateName, + JoinInputSideSpec inputSideSpec, + InternalTypeInfo<RowData> mapKeyType, // Type info for the outer map key + InternalTypeInfo<RowData> recordType, + long retentionTime) { + StateTtlConfig ttlConfig = createTtlConfig(retentionTime); + + if (inputSideSpec.hasUniqueKey()) { + if (inputSideSpec.joinKeyContainsUniqueKey()) { + return new JoinKeyContainsUniqueKey( + ctx, stateName, mapKeyType, recordType, ttlConfig); + } else { + return new InputSideHasUniqueKey( + ctx, + stateName, + mapKeyType, + recordType, + inputSideSpec.getUniqueKeyType(), + inputSideSpec.getUniqueKeySelector(), + ttlConfig); + } + } else { + return new InputSideHasNoUniqueKey(ctx, stateName, mapKeyType, recordType, ttlConfig); + } + } + + /** + * Creates a {@link MapStateDescriptor} with the given parameters and applies TTL configuration. 
+ * + * @param <K> Key type + * @param <V> Value type + * @param stateName Unique name for the state + * @param keyTypeInfo Type information for the key + * @param valueTypeInfo Type information for the value + * @param ttlConfig State TTL configuration + * @return Configured MapStateDescriptor + */ + private static <K, V> MapStateDescriptor<K, V> createStateDescriptor( + String stateName, + TypeInformation<K> keyTypeInfo, + TypeInformation<V> valueTypeInfo, + StateTtlConfig ttlConfig) { + MapStateDescriptor<K, V> descriptor = + new MapStateDescriptor<>(stateName, keyTypeInfo, valueTypeInfo); + if (ttlConfig.isEnabled()) { + descriptor.enableTimeToLive(ttlConfig); + } + return descriptor; + } + + // ------------------------------------------------------------------------------------ + // Multi Join State View Implementations + // ------------------------------------------------------------------------------------ + + /** + * State view for input sides where the unique key is fully contained within the join key. + * + * <p>Stores data as {@code MapState<MapKey, Record>}. 
+ */ + private static final class JoinKeyContainsUniqueKey implements MultiJoinStateView { + + // stores record in the mapping <MapKey, Record> + private final MapState<RowData, RowData> recordState; + private final List<RowData> reusedList; + + private JoinKeyContainsUniqueKey( + RuntimeContext ctx, + final String stateName, + final InternalTypeInfo<RowData> mapKeyType, + final InternalTypeInfo<RowData> recordType, + final StateTtlConfig ttlConfig) { + + MapStateDescriptor<RowData, RowData> recordStateDesc = + createStateDescriptor(stateName, mapKeyType, recordType, ttlConfig); + + this.recordState = ctx.getMapState(recordStateDesc); + // the result records always not more than 1 per mapKey + this.reusedList = new ArrayList<>(1); + } + + @Override + public void addRecord(RowData mapKey, RowData record) throws Exception { + recordState.put(mapKey, record); + } + + @Override + public void retractRecord(RowData mapKey, RowData record) throws Exception { + // Only one record is kept per mapKey, remove it directly. + recordState.remove(mapKey); + } + + @Override + public Iterable<RowData> getRecords(RowData mapKey) throws Exception { + reusedList.clear(); + RowData record = recordState.get(mapKey); + if (record != null) { + reusedList.add(record); + } + return reusedList; + } + + @Override + public void cleanup(RowData mapKey) throws Exception { + recordState.remove(mapKey); + } + } + + /** + * State view for input sides that have a unique key, but it differs from the join key. + * + * <p>Stores data as {@code MapState<MapKey, Map<UK, Record>>}. 
+ */ + private static final class InputSideHasUniqueKey implements MultiJoinStateView { + + // stores map in the mapping <MapKey, Map<UK, Record>> + private final MapState<RowData, Map<RowData, RowData>> recordState; + private final KeySelector<RowData, RowData> uniqueKeySelector; + + private InputSideHasUniqueKey( + RuntimeContext ctx, + final String stateName, + final InternalTypeInfo<RowData> mapKeyType, + final InternalTypeInfo<RowData> recordType, + final InternalTypeInfo<RowData> uniqueKeyType, + final KeySelector<RowData, RowData> uniqueKeySelector, + final StateTtlConfig ttlConfig) { + checkNotNull(uniqueKeyType); + checkNotNull(uniqueKeySelector); + this.uniqueKeySelector = uniqueKeySelector; + + TypeInformation<Map<RowData, RowData>> mapValueTypeInfo = + Types.MAP(uniqueKeyType, recordType); // UK is the key in the inner map + + MapStateDescriptor<RowData, Map<RowData, RowData>> recordStateDesc = + createStateDescriptor(stateName, mapKeyType, mapValueTypeInfo, ttlConfig); + + this.recordState = ctx.getMapState(recordStateDesc); + } + + @Override + public void addRecord(RowData mapKey, RowData record) throws Exception { + RowData uniqueKey = uniqueKeySelector.getKey(record); + Map<RowData, RowData> uniqueKeyToRecordMap = recordState.get(mapKey); + if (uniqueKeyToRecordMap == null) { + uniqueKeyToRecordMap = new HashMap<>(); + } + uniqueKeyToRecordMap.put(uniqueKey, record); + recordState.put(mapKey, uniqueKeyToRecordMap); + } + + @Override + public void retractRecord(RowData mapKey, RowData record) throws Exception { + RowData uniqueKey = uniqueKeySelector.getKey(record); + Map<RowData, RowData> uniqueKeyToRecordMap = recordState.get(mapKey); + if (uniqueKeyToRecordMap != null) { + uniqueKeyToRecordMap.remove(uniqueKey); + if (uniqueKeyToRecordMap.isEmpty()) { + // Clean up the entry for mapKey if the inner map becomes empty + recordState.remove(mapKey); + } else { + recordState.put(mapKey, uniqueKeyToRecordMap); + } + } + // ignore uniqueKeyToRecordMap == 
null + } + + @Override + public Iterable<RowData> getRecords(RowData mapKey) throws Exception { + Map<RowData, RowData> uniqueKeyToRecordMap = recordState.get(mapKey); + if (uniqueKeyToRecordMap == null) { + return Collections.emptyList(); + } else { + // Return the values (records) from the inner map + return uniqueKeyToRecordMap.values(); + } + } + + @Override + public void cleanup(RowData mapKey) throws Exception { + recordState.remove(mapKey); + } + } + + /** + * State view for input sides that do not have a unique key (multi-set semantics). + * + * <p>Stores data as {@code MapState<MapKey, Map<Record, Count>>}. + */ + private static final class InputSideHasNoUniqueKey implements MultiJoinStateView { + + // stores map in the mapping <MapKey, Map<Record, Count>> + private final MapState<RowData, Map<RowData, Integer>> recordState; Review Comment: Hi @gustavodemorais! Thank you so much for your reply and explanation! > It's not necessary to get all the data from T2. Yes, I didn't mean we need to get **all** data from T2, but only all the records associated with K1. > For example, for SELECT * FROM T1 JOIN T2 on T1.k1 = T2.k1 AND T1.k98 = T2.k98 JOIN T3 on T2.k1 = T3.k1 AND T2.k99 == T3.k99 we could start at any position. In this example, if we are receiving an input record from T1 and start from T3, then we can't filter the records on the T2.k99 == T3.k99 condition because we don't have k99 in T1. So changing the order in which the states are requested won't work. For the same reason, you won't be able to request multiple states in parallel. > Let me give a classic example SELECT * FROM users LEFT JOIN orders on users.id = orders.uid LEFT JOIN payment orders.uid = payment.uid AND orders.id = payment.oid -> this would already not be supported and it's only a 3 way join! Yes, I agree with you. But this is only a question of DB normalization. In a normalized DB you won't have the field `payment.uid` in the table `payment`.
And of course, if you join 5, 10, or 15 tables you most probably will not even have a CommonJoinKey among all tables. And from my point of view, the only DB schema where the Multi Way Join can be implemented is the `star` schema. I'm not trying to say that using only one JoinKey as a multiWayJoin condition is enough for many cases. But I think that using a commonJoinKey instead does not cover a much wider range of cases. > All that said, when we have a first e2e working version, we can check real benchmark numbers to see the impact of this and if there's a significant optimization we won't be able to implement. If that's the case, I think the community would be happy to receive one more operator contribution/optimizations. The optimizer could identify the specific cases and then decide accordingly which operator to use. Let me know if that makes sense :) Yes, thank you for this clarification. Let's wait for benchmark results. And if it can be improved by implementing an additional version of StreamingMultiJoinOperator (which would work only for JoinKey, not CommonJoinKey), then we will try to do it. BTW, here you can find the PR to the nexmark project with an attempt at constructing and benchmarking a multi-way join: https://github.com/nexmark/nexmark/pull/70 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org