zhengruifeng commented on code in PR #51937:
URL: https://github.com/apache/spark/pull/51937#discussion_r2268437700
##########
python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py:
##########
@@ -509,6 +509,62 @@ def iter_func(it):
actual = df.select(struct_f(struct(col("id"),
col("id").cast("string").alias("str"))))
self.assertEqual(expected, actual.collect())
+ def test_vectorized_udf_struct_missing_field(self):
+ df = self.spark.range(10)
+ return_type = StructType([StructField("id", LongType()),
StructField("str", StringType())])
+
+ def scalar_func(id: int) -> pd.DataFrame:
+ return pd.DataFrame({"id": id})
+
+ f = pandas_udf(scalar_func, returnType=return_type,
functionType=PandasUDFType.SCALAR)
+ # before fix, exception was: KeyError: 'str'
+ with self.assertRaisesRegex(
+ PythonException,
+ "Column names of the returned pandas.DataFrame do not match
specified schema. "
+ "Missing: str.\n",
+ ):
+ df.select(f(col("id")).alias("struct")).collect()
+
+ def test_vectorized_udf_struct_unexpected_field(self):
+ df = self.spark.range(10)
+ return_type = StructType([StructField("id", LongType()),
StructField("str", StringType())])
+
+ def scalar_func(id: int) -> pd.DataFrame:
+ return pd.DataFrame({"id": id, "str2": id.apply(str), "str":
id.apply(str)})
+
+ f = pandas_udf(scalar_func, returnType=return_type,
functionType=PandasUDFType.SCALAR)
+ # before fix, silently succeeds. Now raises an exception.
+ # note that because we truncate fields in returned pd.DataFrame
+ # the 'str' column will appear to be missing to validation logic
+ with self.assertRaisesRegex(
+ PythonException,
+ "Column names of the returned pandas.DataFrame do not match
specified schema. "
+ "Missing: str. Unexpected: str2.\n",
+ ):
+ df.select(f(col("id")).alias("struct")).collect()
+
+ def test_vectorized_udf_struct_duplicate_field(self):
Review Comment:
cc @ueshin
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]