This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch release-0.13.0 in repository https://gitbox.apache.org/repos/asf/hudi.git
commit 78a2c647ceb6122738f68898d4fa80f6e8b472c2 Author: Alexey Kudinkin <[email protected]> AuthorDate: Fri Feb 3 12:57:00 2023 -0800 [MINOR] Fixing CTAS configuration not propagated properly (#7832) This change addresses the issue of CTAS erroneously performing de-duplication, due to the fact that - It reuses `InsertIntoHoodieTableCommand` infrastructure to insert the data - It provides extra options overriding default configs (tuned for the Insert Into statement) specifically for CTAS - Extra options not being merged into the final config (at some point after 0.12.2) resulted in it performing de-duplication by default --- .../spark/sql/hudi/ProvidesHoodieConfig.scala | 2 +- .../apache/spark/sql/hudi/TestCreateTable.scala | 28 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala index c8f01a12623..0c766f5135b 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala @@ -186,7 +186,7 @@ trait ProvidesHoodieConfig extends Logging { HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPrecombineColumn) ) - val overridingOpts = Map( + val overridingOpts = extraOptions ++ Map( "path" -> path, TABLE_TYPE.key -> tableType, TBL_NAME.key -> hoodieCatalogTable.tableName, diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala index 120c6adb6cd..47a8054252a 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala +++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.types._ import org.junit.jupiter.api.Assertions.assertFalse import scala.collection.JavaConverters._ +import scala.collection.Seq class TestCreateTable extends HoodieSparkSqlTestBase { @@ -1036,4 +1037,31 @@ class TestCreateTable extends HoodieSparkSqlTestBase { checkKeyGenerator("org.apache.hudi.keygen.ComplexKeyGenerator", tableName) spark.sql(s"drop table $tableName") } + + test("Test CTAS not de-duplicating (by default)") { + withRecordType() { + withTempDir { tmp => + val tableName = generateTableName + spark.sql( + s""" + |CREATE TABLE $tableName USING hudi + | LOCATION '${tmp.getCanonicalPath}/$tableName' + | TBLPROPERTIES ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + | AS SELECT * FROM ( + | SELECT 1 as id, 'a1' as name, 10 as price, 1000 as ts + | UNION ALL + | SELECT 1 as id, 'a1' as name, 11 as price, 1001 as ts + | ) + """.stripMargin) + + checkAnswer(s"select id, name, price, ts from $tableName")( + Seq(1, "a1", 10.0, 1000), + Seq(1, "a1", 11.0, 1001) + ) + } + } + } }
