This is an automated email from the ASF dual-hosted git repository.

yihua pushed a commit to branch release-0.13.0
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit 78a2c647ceb6122738f68898d4fa80f6e8b472c2
Author: Alexey Kudinkin <[email protected]>
AuthorDate: Fri Feb 3 12:57:00 2023 -0800

    [MINOR] Fixing CTAS configuration not propagated properly  (#7832)
    
    This change addresses the issue of CTAS erroneously performing 
de-duplication, which happens because:
    
     - It reuses the `InsertIntoHoodieTableCommand` infrastructure to insert 
the data
     - It provides extra options that override the default configs (tuned for 
the Insert Into statement) specifically for CTAS
     - These extra options were not being merged into the final config (since 
some point after 0.12.2), which resulted in CTAS doing de-duplication by default
---
 .../spark/sql/hudi/ProvidesHoodieConfig.scala      |  2 +-
 .../apache/spark/sql/hudi/TestCreateTable.scala    | 28 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
index c8f01a12623..0c766f5135b 100644
--- 
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
+++ 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala
@@ -186,7 +186,7 @@ trait ProvidesHoodieConfig extends Logging {
       HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> 
String.valueOf(hasPrecombineColumn)
     )
 
-    val overridingOpts = Map(
+    val overridingOpts = extraOptions ++ Map(
       "path" -> path,
       TABLE_TYPE.key -> tableType,
       TBL_NAME.key -> hoodieCatalogTable.tableName,
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala
 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala
index 120c6adb6cd..47a8054252a 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala
@@ -33,6 +33,7 @@ import org.apache.spark.sql.types._
 import org.junit.jupiter.api.Assertions.assertFalse
 
 import scala.collection.JavaConverters._
+import scala.collection.Seq
 
 class TestCreateTable extends HoodieSparkSqlTestBase {
 
@@ -1036,4 +1037,31 @@ class TestCreateTable extends HoodieSparkSqlTestBase {
     checkKeyGenerator("org.apache.hudi.keygen.ComplexKeyGenerator", tableName)
     spark.sql(s"drop table $tableName")
   }
+
+  test("Test CTAS not de-duplicating (by default)") {
+    withRecordType() {
+      withTempDir { tmp =>
+        val tableName = generateTableName
+        spark.sql(
+          s"""
+             |CREATE TABLE $tableName USING hudi
+             | LOCATION '${tmp.getCanonicalPath}/$tableName'
+             | TBLPROPERTIES (
+             |  primaryKey = 'id',
+             |  preCombineField = 'ts'
+             | )
+             | AS SELECT * FROM (
+             |  SELECT 1 as id, 'a1' as name, 10 as price, 1000 as ts
+             |  UNION ALL
+             |  SELECT 1 as id, 'a1' as name, 11 as price, 1001 as ts
+             | )
+       """.stripMargin)
+
+        checkAnswer(s"select id, name, price, ts from $tableName")(
+          Seq(1, "a1", 10.0, 1000),
+          Seq(1, "a1", 11.0, 1001)
+        )
+      }
+    }
+  }
 }

Reply via email to