Re: [PR] [SPARK-42746][SQL] Implement LISTAGG function [spark]

via GitHub Fri, 15 Nov 2024 03:02:11 -0800


cloud-fan commented on code in PR #48748:
URL: https://github.com/apache/spark/pull/48748#discussion_r1843597510



##########
sql/core/src/test/resources/sql-tests/results/listagg.sql.out:
##########
@@ -0,0 +1,436 @@
+-- Automatically generated by SQLQueryTestSuite
+-- !query
+CREATE TEMP VIEW df AS
+SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, 
NULL)) AS t(a, b)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+CREATE TEMP VIEW df2 AS
+SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+SELECT listagg(b) FROM df GROUP BY a
+-- !query schema
+struct<listagg(b, NULL):string>
+-- !query output
+NULL
+bc
+cd
+
+
+-- !query
+SELECT string_agg(b) FROM df GROUP BY a
+-- !query schema
+struct<string_agg(b, NULL):string>
+-- !query output
+NULL
+bc
+cd
+
+
+-- !query
+SELECT listagg(b, NULL) FROM df GROUP BY a
+-- !query schema
+struct<listagg(b, NULL):string>
+-- !query output
+NULL
+bc
+cd
+
+
+-- !query
+SELECT listagg(b) FROM df WHERE 1 != 1
+-- !query schema
+struct<listagg(b, NULL):string>
+-- !query output
+NULL
+
+
+-- !query
+SELECT listagg(b, '|') FROM df GROUP BY a
+-- !query schema
+struct<listagg(b, |):string>
+-- !query output
+NULL
+b|c
+c|d
+
+
+-- !query
+SELECT listagg(a) FROM df
+-- !query schema
+struct<listagg(a, NULL):string>
+-- !query output
+aabb
+
+
+-- !query
+SELECT listagg(DISTINCT a) FROM df
+-- !query schema
+struct<listagg(DISTINCT a, NULL):string>
+-- !query output
+ab
+
+
+-- !query
+SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df
+-- !query schema
+struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a ASC NULLS FIRST):string>
+-- !query output
+aabb
+
+
+-- !query
+SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df
+-- !query schema
+struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST):string>
+-- !query output
+bbaa
+
+
+-- !query
+SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df
+-- !query schema
+struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER 
(PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING):string>
+-- !query output
+NULL
+a
+b
+ba
+ba
+
+
+-- !query
+SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df
+-- !query schema
+struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b ASC NULLS FIRST):string>
+-- !query output
+aabb
+
+
+-- !query
+SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df
+-- !query schema
+struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST):string>
+-- !query output
+baba
+
+
+-- !query
+SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df
+-- !query schema
+struct<listagg(a, |) WITHIN GROUP (ORDER BY b DESC NULLS LAST):string>
+-- !query output
+b|a|b|a
+
+
+-- !query
+SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df
+-- !query schema
+struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a ASC NULLS 
FIRST):string>
+-- !query output
+baba
+
+
+-- !query
+SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df
+-- !query schema
+struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a DESC NULLS 
LAST):string>
+-- !query output
+bbaa
+
+
+-- !query
+SELECT listagg(c1) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1)
+-- !query schema
+struct<listagg(c1, NULL):binary>
+-- !query output
+ޭ��
+
+
+-- !query
+SELECT listagg(c1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1)
+-- !query schema
+struct<listagg(c1, NULL):binary>
+-- !query output
+ޭ��
+
+
+-- !query
+SELECT listagg(c1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1)
+-- !query schema
+struct<listagg(c1, X'42'):binary>
+-- !query output
+ޭB��
+
+
+-- !query
+SELECT listagg(a), listagg(b, ',') FROM df2
+-- !query schema
+struct<listagg(a, NULL):string,listagg(b, ,):string>
+-- !query output
+123    true,false,false
+
+
+-- !query
+SELECT listagg(c1) FROM (VALUES (ARRAY['a', 'b'])) AS t(c1)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "error" : "','",
+    "hint" : ""
+  }
+}
+
+
+-- !query
+SELECT listagg(c1, ', ') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "DATATYPE_MISMATCH.DATA_DIFF_TYPES",
+  "sqlState" : "42K09",
+  "messageParameters" : {
+    "dataType" : "(\"BINARY\" or \"STRING\")",
+    "functionName" : "`listagg`",
+    "sqlExpr" : "\"listagg(c1, , )\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 24,
+    "fragment" : "listagg(c1, ', ')"
+  } ]
+}
+
+
+-- !query
+SELECT listagg(b, a) FROM df GROUP BY a
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
+  "sqlState" : "42K09",
+  "messageParameters" : {
+    "inputExpr" : "\"a\"",
+    "inputName" : "`delimiter`",
+    "inputType" : "\"STRING\"",
+    "sqlExpr" : "\"listagg(b, a)\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 20,
+    "fragment" : "listagg(b, a)"
+  } ]
+}
+
+
+-- !query
+SELECT listagg(a) OVER (ORDER BY a) FROM df
+-- !query schema
+struct<listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW):string>
+-- !query output
+NULL
+aa
+aa
+aabb
+aabb
+
+
+-- !query
+SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "aggFunc" : "\"listagg(a, NULL, a)\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 61,
+    "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+  } ]
+}
+
+
+-- !query
+SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "aggFunc" : "\"listagg(a, NULL, a)\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 64,
+    "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+  } ]
+}
+
+
+-- !query
+SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED",
+  "sqlState" : "0A000",
+  "messageParameters" : {
+    "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS 
FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 44,
+    "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)"
+  } ]
+}
+
+
+-- !query
+SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "FUNCTION_AND_ORDER_EXPRESSION_MISMATCH",
+  "sqlState" : "42822",
+  "messageParameters" : {
+    "functionArgs" : "\"a\"",
+    "functionName" : "`listagg`",
+    "orderExpr" : "\"b\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 52,
+    "fragment" : "listagg(DISTINCT a) WITHIN GROUP (ORDER BY b)"
+  } ]
+}
+
+
+-- !query
+SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "FUNCTION_AND_ORDER_EXPRESSION_MISMATCH",
+  "sqlState" : "42822",
+  "messageParameters" : {
+    "functionArgs" : "\"a\"",
+    "functionName" : "`listagg`",
+    "orderExpr" : "\"a\", \"b\""
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 8,
+    "stopIndex" : 55,
+    "fragment" : "listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b)"
+  } ]
+}
+
+
+-- !query
+SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES 
('a'), ('A'), ('b'), ('B')) AS t(c1)
+-- !query schema
+struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_binary) ASC 
NULLS FIRST):string>
+-- !query output
+ABab
+
+
+-- !query
+SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES 
('a'), ('A'), ('b'), ('B')) AS t(c1)
+-- !query schema
+struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC 
NULLS FIRST):string>
+-- !query output
+aAbB
+
+
+-- !query
+SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1)
+-- !query schema
+struct<listagg(DISTINCT collate(c1, utf8_binary), NULL):string>
+-- !query output
+aAbB
+
+
+-- !query
+SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1)
+-- !query schema
+struct<listagg(DISTINCT collate(c1, utf8_lcase), NULL):string collate 
UTF8_LCASE>
+-- !query output
+ab
+
+
+-- !query
+SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
+-- !query schema
+struct<listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY 
collate(c1, utf8_lcase) ASC NULLS FIRST):string collate UTF8_LCASE>
+-- !query output
+aB
+
+
+-- !query
+SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "FUNCTION_AND_ORDER_EXPRESSION_MISMATCH",

Review Comment:
   so the contract is, `listagg(expr1) WITHIN GROUP (ORDER BY expr2)`, expr1 
and expr2 must be the same?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-42746][SQL] Implement LISTAGG function [spark]

Reply via email to