This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 79e1481d43 [core] Fix FieldListaggAgg distinct mode incorrectly
skipping values (#7652)
79e1481d43 is described below
commit 79e1481d43ecdd2f6a573f0971a1df62495b8751
Author: lszskye <[email protected]>
AuthorDate: Thu Apr 16 09:00:22 2026 +0800
[core] Fix FieldListaggAgg distinct mode incorrectly skipping values (#7652)
Fix a bug in `FieldListaggAgg.agg()` where the distinct deduplication
logic
incorrectly uses substring matching (`BinaryString.contains()`) instead
of
exact token matching, causing valid values to be silently dropped.
For example, with delimiter `,`:
- accumulator = `"abc,def,asd"`
- inputField = `"ab,xy"`
- Token `"ab"` is incorrectly skipped because
`"abc,def,asd".contains("ab")`
returns `true`
- Result: `"abc,def,asd,xy"` (missing `"ab"`)
- Expected: `"abc,def,asd,ab,xy"`
---
.../compact/aggregate/FieldListaggAgg.java | 18 +++++++---
.../mergetree/compact/MergeFunctionTestUtils.java | 10 +++---
.../compact/aggregate/FieldAggregatorTest.java | 40 ++++++++++++++++++++++
3 files changed, 60 insertions(+), 8 deletions(-)
diff --git
a/paimon-core/src/main/java/org/apache/paimon/mergetree/compact/aggregate/FieldListaggAgg.java
b/paimon-core/src/main/java/org/apache/paimon/mergetree/compact/aggregate/FieldListaggAgg.java
index cdd3901f24..6ca31bbead 100644
---
a/paimon-core/src/main/java/org/apache/paimon/mergetree/compact/aggregate/FieldListaggAgg.java
+++
b/paimon-core/src/main/java/org/apache/paimon/mergetree/compact/aggregate/FieldListaggAgg.java
@@ -24,7 +24,9 @@ import org.apache.paimon.types.VarCharType;
import org.apache.paimon.utils.BinaryStringUtils;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import static
org.apache.paimon.utils.BinaryStringUtils.splitByWholeSeparatorPreserveAllTokens;
@@ -35,11 +37,14 @@ public class FieldListaggAgg extends FieldAggregator {
private final String delimiter;
+ private final BinaryString delimiterBinaryString;
+
private final boolean distinct;
public FieldListaggAgg(String name, VarCharType dataType, CoreOptions
options, String field) {
super(name, dataType);
this.delimiter = options.fieldListAggDelimiter(field);
+ this.delimiterBinaryString = BinaryString.fromString(this.delimiter);
this.distinct = options.fieldCollectAggDistinct(field);
}
@@ -62,16 +67,22 @@ public class FieldListaggAgg extends FieldAggregator {
}
if (distinct) {
- BinaryString delimiterBinaryString =
BinaryString.fromString(delimiter);
+ BinaryString[] accumulatorTokens =
+ splitByWholeSeparatorPreserveAllTokens(mergeFieldSD,
delimiterBinaryString);
+ Set<BinaryString> existingTokens = new
HashSet<>(accumulatorTokens.length);
+ for (BinaryString token : accumulatorTokens) {
+ existingTokens.add(token);
+ }
List<BinaryString> result = new ArrayList<>();
result.add(mergeFieldSD);
for (BinaryString str :
splitByWholeSeparatorPreserveAllTokens(inFieldSD,
delimiterBinaryString)) {
- if (str.getSizeInBytes() == 0 || mergeFieldSD.contains(str)) {
+ if (str.getSizeInBytes() == 0 || existingTokens.contains(str))
{
continue;
}
+ existingTokens.add(str);
result.add(delimiterBinaryString);
result.add(str);
}
@@ -83,7 +94,6 @@ public class FieldListaggAgg extends FieldAggregator {
return BinaryStringUtils.concat(result);
}
- return BinaryStringUtils.concat(
- mergeFieldSD, BinaryString.fromString(delimiter), inFieldSD);
+ return BinaryStringUtils.concat(mergeFieldSD, delimiterBinaryString,
inFieldSD);
}
}
diff --git
a/paimon-core/src/test/java/org/apache/paimon/mergetree/compact/MergeFunctionTestUtils.java
b/paimon-core/src/test/java/org/apache/paimon/mergetree/compact/MergeFunctionTestUtils.java
index 8035985969..27ae5cce5d 100644
---
a/paimon-core/src/test/java/org/apache/paimon/mergetree/compact/MergeFunctionTestUtils.java
+++
b/paimon-core/src/test/java/org/apache/paimon/mergetree/compact/MergeFunctionTestUtils.java
@@ -66,10 +66,12 @@ public class MergeFunctionTestUtils {
expected.add(group.get(group.size() - 1));
} else {
if (group.stream().noneMatch(data -> data.valueKind ==
RowKind.INSERT)) {
- // No insert: fill the pk and left nullable fields to
null; sequenceNumber = 0
- // because it's not updated
- ReusingTestData last = group.get(group.size() - 1);
- expected.add(new ReusingTestData(last.key, 0,
RowKind.DELETE, null));
+ // No insert with ignoreDelete: initRow sets all fields
(including
+ // nullable ones) from the first DELETE record, then all
DELETEs are
+ // ignored. sequenceNumber stays 0 because
latestSequenceNumber is
+ // updated after the ignoreDelete check.
+ ReusingTestData first = group.get(0);
+ expected.add(new ReusingTestData(first.key, 0,
RowKind.DELETE, first.value));
} else {
// get the last INSERT data because later DELETE data are
ignored
group.stream()
diff --git
a/paimon-core/src/test/java/org/apache/paimon/mergetree/compact/aggregate/FieldAggregatorTest.java
b/paimon-core/src/test/java/org/apache/paimon/mergetree/compact/aggregate/FieldAggregatorTest.java
index 7e9a3aaf25..f040a5eb73 100644
---
a/paimon-core/src/test/java/org/apache/paimon/mergetree/compact/aggregate/FieldAggregatorTest.java
+++
b/paimon-core/src/test/java/org/apache/paimon/mergetree/compact/aggregate/FieldAggregatorTest.java
@@ -356,6 +356,46 @@ public class FieldAggregatorTest {
.isEqualTo("user1-user2");
}
+ @Test
+ public void testFieldListAggDistinctShouldNotMatchSubstring() {
+ FieldListaggAgg fieldListaggAgg =
+ new FieldListaggAggFactory()
+ .create(
+ new VarCharType(VarCharType.MAX_LENGTH),
+ CoreOptions.fromMap(
+
ImmutableMap.of("fields.fieldName.distinct", "true")),
+ "fieldName");
+
+ BinaryString accumulator = BinaryString.fromString("abc,def,asd");
+ BinaryString inputField = BinaryString.fromString("ab,xy");
+ Object result = fieldListaggAgg.agg(accumulator, inputField);
+
+ assertNotNull(result);
+ assertEquals("abc,def,asd,ab,xy", result.toString());
+ }
+
+ @Test
+ public void testFieldListAggDistinctSubstringWithCustomDelimiter() {
+ FieldListaggAgg fieldListaggAgg =
+ new FieldListaggAggFactory()
+ .create(
+ new VarCharType(VarCharType.MAX_LENGTH),
+ CoreOptions.fromMap(
+ ImmutableMap.of(
+ "fields.fieldName.distinct",
+ "true",
+
"fields.fieldName.list-agg-delimiter",
+ ";")),
+ "fieldName");
+
+ BinaryString accumulator = BinaryString.fromString("abc;def;asd");
+ BinaryString inputField = BinaryString.fromString("ab;xy;def");
+ Object result = fieldListaggAgg.agg(accumulator, inputField);
+
+ assertNotNull(result);
+ assertEquals("abc;def;asd;ab;xy", result.toString());
+ }
+
@Test
public void testFieldListAggWithBoundedVarcharShouldFail() {
FieldListaggAggFactory factory = new FieldListaggAggFactory();