Re: [PR] [SPARK-55501][SQL] Fix listagg distinct + within group order by bug [spark]

via GitHub Wed, 18 Feb 2026 22:47:59 -0800


helioshe4 commented on code in PR #54297:
URL: https://github.com/apache/spark/pull/54297#discussion_r2825964524



##########
sql/core/src/test/resources/sql-tests/results/listagg.sql.out:
##########
@@ -189,6 +189,144 @@ struct<len(col1):int,regexp_count(col1, 
1):int,regexp_count(col1, 2):int,regexp_
 3      1       1       1       16      1       0
 
 
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (1), 
(2), (2), (3) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+1,2,3
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES 
(cast(1 as bigint)), (cast(2 as bigint)), (cast(2 as bigint)), (cast(3 as 
bigint)) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+1,2,3
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES 
(cast(1 as smallint)), (cast(2 as smallint)), (cast(2 as smallint)), (cast(3 as 
smallint)) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+1,2,3
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES 
(cast(1 as tinyint)), (cast(2 as tinyint)), (cast(2 as tinyint)), (cast(3 as 
tinyint)) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+1,2,3
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES 
(cast(1.10 as decimal(10,2))), (cast(2.20 as decimal(10,2))), (cast(2.20 as 
decimal(10,2))) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+1.10,2.20
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES 
(DATE'2024-01-01'), (DATE'2024-01-02'), (DATE'2024-01-01') AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+2024-01-01,2024-01-02
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES 
(TIMESTAMP_NTZ'2024-01-01 10:00:00'), (TIMESTAMP_NTZ'2024-01-02 12:00:00'), 
(TIMESTAMP_NTZ'2024-01-01 10:00:00') AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+2024-01-01 10:00:00,2024-01-02 12:00:00
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES 
(true), (false), (true) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+false,true
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES 
(INTERVAL '1' MONTH), (INTERVAL '2' MONTH), (INTERVAL '1' MONTH) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+INTERVAL '1' MONTH,INTERVAL '2' MONTH
+
+
+-- !query
+SELECT listagg(DISTINCT cast(col as string), ',') WITHIN GROUP (ORDER BY col) 
FROM VALUES (10), (1), (2), (20), (2) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT CAST(col AS STRING), ,) WITHIN GROUP (ORDER BY col ASC 
NULLS FIRST):string>
+-- !query output
+1,2,10,20
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col DESC) FROM VALUES 
(1), (10), (2), (20), (2) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col DESC NULLS 
LAST):string>
+-- !query output
+20,10,2,1
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (1), 
(2), (null), (2), (3) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+1,2,3
+
+
+-- !query
+SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col NULLS FIRST) FROM 
VALUES (1), (null), (2), (null) AS t(col)
+-- !query schema
+struct<listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS 
FIRST):string>
+-- !query output
+1,2
+
+
+-- !query
+SELECT grp, listagg(DISTINCT col) WITHIN GROUP (ORDER BY col) FROM VALUES (1, 
'a'), (1, 'b'), (2, 'a'), (2, 'a'), (1, 'b') AS t(grp, col) GROUP BY grp
+-- !query schema
+struct<grp:int,listagg(DISTINCT col, NULL) WITHIN GROUP (ORDER BY col ASC 
NULLS FIRST):string>
+-- !query output
+1      ab
+2      a
+
+
+-- !query
+WITH t(col) AS (SELECT listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY 
col1) FROM (VALUES (X'DEAD'), (X'BEEF'), (X'DEAD'), (X'CAFE'))) SELECT 
len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF'), 
regexp_count(col, X'CAFE') FROM t
+-- !query schema
+struct<len(col):int,regexp_count(col, X'DEAD'):int,regexp_count(col, 
X'BEEF'):int,regexp_count(col, X'CAFE'):int>
+-- !query output
+8      1       2       2

Review Comment:
   hm interesting
   
   the issue is that `regexp_count` takes `StringType` as input, so it was 
doing some weird cast on the binary to convert to string, and X'BEEF' and 
X'CAFE' are both invalid UTF8 decodings (which map to the same undefined string 
value "\uFFFD"), which is why they show up as count 2.  
   
   
https://github.com/apache/spark/blob/b976e7b9e058f9e36309008e4d9b14fc085f9773/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L1091-L1092
   
   
   The previous existing tests in listagg.sql print the expected result because 
those values decoded to valid (but meaningless and different) strings.  The fix 
is to wrap the binary with a hex(), which can properly decode the binary to 
string.  I'll change all of them and regenerate the files.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-55501][SQL] Fix listagg distinct + within group order by bug [spark]

Reply via email to