sunxiaoguang commented on code in PR #49453: URL: https://github.com/apache/spark/pull/49453#discussion_r1929552100
########## connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala: ########## @@ -241,6 +241,61 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest assert(rows10(0).getString(0) === "amy") assert(rows10(1).getString(0) === "alex") } + + test("SPARK-50793: MySQL JDBC Connector failed to cast some types") { + val tableName = catalogName + ".test_cast_function" + withTable(tableName) { + val stringValue = "0" + val stringLiteral = "'0'" + val longValue = 0L + val binaryValue = Array[Byte](0x30) + val binaryLiteral = "x'30'" + val doubleValue = 0.0 + val doubleLiteral = "0.0" + // CREATE table to use types defined in Spark SQL + sql( + s"CREATE TABLE $tableName (string_col STRING, long_col LONG, " + + "binary_col BINARY, double_col DOUBLE)") + sql( + s"INSERT INTO $tableName VALUES($stringLiteral, $longValue, $binaryLiteral, $doubleValue)") + + def testCast( + castType: String, + sourceCol: String, + targetCol: String, + targetDataType: DataType, + targetValue: Any): Unit = { + val sql = s"SELECT CAST($sourceCol AS $castType) AS target " + + s"FROM $tableName WHERE CAST($sourceCol AS $castType) = $targetCol" + val df = spark.sql(sql) + castType match { + case "SHORT" | "INTEGER" => + checkError( + exception = intercept[SparkException] { + df.collect() + }, + condition = null) + case _ => + checkFilterPushed(df) + checkAnswer(df, Seq(Row(targetValue))) + val expectedTypes = Array(targetDataType) + val resultTypes = df.schema.fields.map(_.dataType) + assert(resultTypes === expectedTypes, s"Failed to cast $sourceCol to $castType") + } + } + + testCast("BINARY", "string_col", "binary_col", BinaryType, binaryValue); + testCast("SHORT", "string_col", "long_col", LongType, longValue) + testCast("INTEGER", "string_col", "long_col", LongType, longValue) + testCast("LONG", "string_col", "long_col", LongType, longValue) + // We use stringLiteral to make sure both values are using the same collation + testCast("STRING", "long_col", stringLiteral, StringType, stringValue) Review Comment: Since the test runs with both `MariaDB Connector/J 2.7.12` and `MySQL Connector/J 9.1.0`, we need this to ensure consistent behavior in the test. There are many details involved. Simply put, we don't want to change the environment or the behavior of the test setup, so we need to make the new test work with the existing test setup while still validating the test cases. The following details are provided for everyone's reference. The reason they are incompatible is that, for the test setup, `MariaDB connector/j 2.7.12` uses `utf8mb4_unicode_ci` for `collation_connection`, whereas MySQL Connector/J 9.1.0 uses utf8mb4_0900_ai_ci for `collation_connection`. The test would fail without using `stringLiteral` here, as shown below. <img width="1920" alt="image" src="https://github.com/user-attachments/assets/6e4454b5-ffe0-483e-b506-8a5f6201719b" /> [The implementation for deciding](https://github.com/mariadb-corporation/mariadb-connector-j/blob/175b47fa386c3a4acac827ae0a00214ef812c297/src/main/java/org/mariadb/jdbc/internal/protocol/AbstractConnectProtocol.java#L1229) `connection_collation` in MariaDB Connector/J 2.7.12 is as follows: the `serverLanguage` is set to 255 for the test setup of MySQL Server 9.1.0; therefore, the chosen collation is 224, which corresponds to utf8mb4_unicode_ci. ```java /** * Default collation used for string exchanges with server. * * @param serverLanguage server default collation * @return collation byte */ private byte decideLanguage(int serverLanguage) { // return current server utf8mb4 collation if (serverLanguage == 45 // utf8mb4_general_ci || serverLanguage == 46 // utf8mb4_bin || (serverLanguage >= 224 && serverLanguage <= 247)) { return (byte) serverLanguage; } if (getMajorServerVersion() == 5 && getMinorServerVersion() <= 1) { // 5.1 version doesn't know 4 bytes utf8 return (byte) 33; // utf8_general_ci } // if server language is utf8mb3, use utf8mb4 equivalent collation if (serverLanguage == 33) { // utf8mb4_general_ci return 45; } if (serverLanguage == 83) { // utf8mb4_bin return 46; } if (serverLanguage >= 192 && serverLanguage <= 215) { // equivalent utf8mb4 collation return (byte) (serverLanguage + 32); } return (byte) 224; // UTF8MB4_UNICODE_CI; } ``` On the other hand, [the implementation of](https://github.com/mysql/mysql-connector-j/blob/cf2917ea44ae2e43a4514a33771035aa99de73bf/src/main/core-api/java/com/mysql/cj/CharsetMapping.java#L395) `MySQL Connector/J 9.1.0` utilizes a lookup table. It uses the collation `utf8mb4_0900_ai_ci`, which is the same as `collation_server` and `collation_database` in the test setup. ```java // 216..222 collation[223] = new Collation(223, new String[] { "utf8mb3_general_mysql500_ci", "utf8_general_mysql500_ci" }, 0, MYSQL_CHARSET_NAME_utf8mb3); collation[224] = new Collation(224, "utf8mb4_unicode_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[225] = new Collation(225, "utf8mb4_icelandic_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[226] = new Collation(226, "utf8mb4_latvian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[227] = new Collation(227, "utf8mb4_romanian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[228] = new Collation(228, "utf8mb4_slovenian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[229] = new Collation(229, "utf8mb4_polish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[230] = new Collation(230, "utf8mb4_estonian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[231] = new Collation(231, "utf8mb4_spanish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[232] = new Collation(232, "utf8mb4_swedish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[233] = new Collation(233, "utf8mb4_turkish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[234] = new Collation(234, "utf8mb4_czech_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[235] = new Collation(235, "utf8mb4_danish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[236] = new Collation(236, "utf8mb4_lithuanian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[237] = new Collation(237, "utf8mb4_slovak_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[238] = new Collation(238, "utf8mb4_spanish2_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[239] = new Collation(239, "utf8mb4_roman_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[240] = new Collation(240, "utf8mb4_persian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[241] = new Collation(241, "utf8mb4_esperanto_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[242] = new Collation(242, "utf8mb4_hungarian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[243] = new Collation(243, "utf8mb4_sinhala_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[244] = new Collation(244, "utf8mb4_german2_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[245] = new Collation(245, "utf8mb4_croatian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[246] = new Collation(246, "utf8mb4_unicode_520_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[247] = new Collation(247, "utf8mb4_vietnamese_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[248] = new Collation(248, "gb18030_chinese_ci", 1, MYSQL_CHARSET_NAME_gb18030); collation[249] = new Collation(249, "gb18030_bin", 0, MYSQL_CHARSET_NAME_gb18030); collation[250] = new Collation(250, "gb18030_unicode_520_ci", 0, MYSQL_CHARSET_NAME_gb18030); // 251..254 collation[255] = new Collation(255, "utf8mb4_0900_ai_ci", 1, MYSQL_CHARSET_NAME_utf8mb4); collation[256] = new Collation(256, "utf8mb4_de_pb_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[257] = new Collation(257, "utf8mb4_is_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[258] = new Collation(258, "utf8mb4_lv_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[259] = new Collation(259, "utf8mb4_ro_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[260] = new Collation(260, "utf8mb4_sl_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[261] = new Collation(261, "utf8mb4_pl_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[262] = new Collation(262, "utf8mb4_et_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[263] = new Collation(263, "utf8mb4_es_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[264] = new Collation(264, "utf8mb4_sv_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[265] = new Collation(265, "utf8mb4_tr_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[266] = new Collation(266, "utf8mb4_cs_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[267] = new Collation(267, "utf8mb4_da_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[268] = new Collation(268, "utf8mb4_lt_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[269] = new Collation(269, "utf8mb4_sk_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[270] = new Collation(270, "utf8mb4_es_trad_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[271] = new Collation(271, "utf8mb4_la_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org