sunxiaoguang commented on code in PR #49453: URL: https://github.com/apache/spark/pull/49453#discussion_r1929552100
########## connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala: ########## @@ -241,6 +241,61 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest assert(rows10(0).getString(0) === "amy") assert(rows10(1).getString(0) === "alex") } + + test("SPARK-50793: MySQL JDBC Connector failed to cast some types") { + val tableName = catalogName + ".test_cast_function" + withTable(tableName) { + val stringValue = "0" + val stringLiteral = "'0'" + val longValue = 0L + val binaryValue = Array[Byte](0x30) + val binaryLiteral = "x'30'" + val doubleValue = 0.0 + val doubleLiteral = "0.0" + // CREATE table to use types defined in Spark SQL + sql( + s"CREATE TABLE $tableName (string_col STRING, long_col LONG, " + + "binary_col BINARY, double_col DOUBLE)") + sql( + s"INSERT INTO $tableName VALUES($stringLiteral, $longValue, $binaryLiteral, $doubleValue)") + + def testCast( + castType: String, + sourceCol: String, + targetCol: String, + targetDataType: DataType, + targetValue: Any): Unit = { + val sql = s"SELECT CAST($sourceCol AS $castType) AS target " + + s"FROM $tableName WHERE CAST($sourceCol AS $castType) = $targetCol" + val df = spark.sql(sql) + castType match { + case "SHORT" | "INTEGER" => + checkError( + exception = intercept[SparkException] { + df.collect() + }, + condition = null) + case _ => + checkFilterPushed(df) + checkAnswer(df, Seq(Row(targetValue))) + val expectedTypes = Array(targetDataType) + val resultTypes = df.schema.fields.map(_.dataType) + assert(resultTypes === expectedTypes, s"Failed to cast $sourceCol to $castType") + } + } + + testCast("BINARY", "string_col", "binary_col", BinaryType, binaryValue); + testCast("SHORT", "string_col", "long_col", LongType, longValue) + testCast("INTEGER", "string_col", "long_col", LongType, longValue) + testCast("LONG", "string_col", "long_col", LongType, longValue) + // We use stringLiteral to make sure both values are using the same collation + testCast("STRING", "long_col", stringLiteral, StringType, stringValue) Review Comment: Since the test runs with both `MariaDB connector/j 2.7.12` and `MySQL connector/j 9.1.0` we need this to make the behavior of test consistent. It's a lot of detail. Simply put, we don't want to change the environment and changing the behavior of test setup, so we need to make the new test works with existing test setup while still validating the test cases. And the following is the detail for everyone's reference. The reason they are incompatible is that for the test setup, `MariaDB connector/j 2.7.12` uses `utf8mb4_unicode_ci` for `collation_connection`, on the other hand `MySQL connector/j 9.1.0` uses `utf8mb4_0900_ai_ci` for `collation_connection`. The test would fail without using `stringLiteral` at here like this. <img width="1920" alt="image" src="https://github.com/user-attachments/assets/6e4454b5-ffe0-483e-b506-8a5f6201719b" /> [The implementation to decide](https://github.com/mariadb-corporation/mariadb-connector-j/blob/175b47fa386c3a4acac827ae0a00214ef812c297/src/main/java/org/mariadb/jdbc/internal/protocol/AbstractConnectProtocol.java#L1229) `connection_collation` in `MariaDB connector/j 2.7.12` is like this. The serverLanguage is 255 for test setup of `MySQL server 9.1.0`, therefore the collation chosen is 224 which is `utf8mb4_unicode_ci` ```java /** * Default collation used for string exchanges with server. * * @param serverLanguage server default collation * @return collation byte */ private byte decideLanguage(int serverLanguage) { // return current server utf8mb4 collation if (serverLanguage == 45 // utf8mb4_general_ci || serverLanguage == 46 // utf8mb4_bin || (serverLanguage >= 224 && serverLanguage <= 247)) { return (byte) serverLanguage; } if (getMajorServerVersion() == 5 && getMinorServerVersion() <= 1) { // 5.1 version doesn't know 4 bytes utf8 return (byte) 33; // utf8_general_ci } // if server language is utf8mb3, use utf8mb4 equivalent collation if (serverLanguage == 33) { // utf8mb4_general_ci return 45; } if (serverLanguage == 83) { // utf8mb4_bin return 46; } if (serverLanguage >= 192 && serverLanguage <= 215) { // equivalent utf8mb4 collation return (byte) (serverLanguage + 32); } return (byte) 224; // UTF8MB4_UNICODE_CI; } ``` On the other hand, [the implementation of](https://github.com/mysql/mysql-connector-j/blob/cf2917ea44ae2e43a4514a33771035aa99de73bf/src/main/core-api/java/com/mysql/cj/CharsetMapping.java#L395) `MySQL connector/j 9.1.0` is a lookup table like this. It uses the collation `utf8mb4_0900_ai_ci` which is the same as `collation_server` and `collation_database` with the test setup. ```java // 216..222 collation[223] = new Collation(223, new String[] { "utf8mb3_general_mysql500_ci", "utf8_general_mysql500_ci" }, 0, MYSQL_CHARSET_NAME_utf8mb3); collation[224] = new Collation(224, "utf8mb4_unicode_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[225] = new Collation(225, "utf8mb4_icelandic_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[226] = new Collation(226, "utf8mb4_latvian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[227] = new Collation(227, "utf8mb4_romanian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[228] = new Collation(228, "utf8mb4_slovenian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[229] = new Collation(229, "utf8mb4_polish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[230] = new Collation(230, "utf8mb4_estonian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[231] = new Collation(231, "utf8mb4_spanish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[232] = new Collation(232, "utf8mb4_swedish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[233] = new Collation(233, "utf8mb4_turkish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[234] = new Collation(234, "utf8mb4_czech_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[235] = new Collation(235, "utf8mb4_danish_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[236] = new Collation(236, "utf8mb4_lithuanian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[237] = new Collation(237, "utf8mb4_slovak_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[238] = new Collation(238, "utf8mb4_spanish2_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[239] = new Collation(239, "utf8mb4_roman_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[240] = new Collation(240, "utf8mb4_persian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[241] = new Collation(241, "utf8mb4_esperanto_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[242] = new Collation(242, "utf8mb4_hungarian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[243] = new Collation(243, "utf8mb4_sinhala_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[244] = new Collation(244, "utf8mb4_german2_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[245] = new Collation(245, "utf8mb4_croatian_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[246] = new Collation(246, "utf8mb4_unicode_520_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[247] = new Collation(247, "utf8mb4_vietnamese_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[248] = new Collation(248, "gb18030_chinese_ci", 1, MYSQL_CHARSET_NAME_gb18030); collation[249] = new Collation(249, "gb18030_bin", 0, MYSQL_CHARSET_NAME_gb18030); collation[250] = new Collation(250, "gb18030_unicode_520_ci", 0, MYSQL_CHARSET_NAME_gb18030); // 251..254 collation[255] = new Collation(255, "utf8mb4_0900_ai_ci", 1, MYSQL_CHARSET_NAME_utf8mb4); collation[256] = new Collation(256, "utf8mb4_de_pb_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[257] = new Collation(257, "utf8mb4_is_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[258] = new Collation(258, "utf8mb4_lv_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[259] = new Collation(259, "utf8mb4_ro_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[260] = new Collation(260, "utf8mb4_sl_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[261] = new Collation(261, "utf8mb4_pl_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[262] = new Collation(262, "utf8mb4_et_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[263] = new Collation(263, "utf8mb4_es_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[264] = new Collation(264, "utf8mb4_sv_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[265] = new Collation(265, "utf8mb4_tr_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[266] = new Collation(266, "utf8mb4_cs_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[267] = new Collation(267, "utf8mb4_da_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[268] = new Collation(268, "utf8mb4_lt_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[269] = new Collation(269, "utf8mb4_sk_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[270] = new Collation(270, "utf8mb4_es_trad_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); collation[271] = new Collation(271, "utf8mb4_la_0900_ai_ci", 0, MYSQL_CHARSET_NAME_utf8mb4); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org