Hans-Raintree commented on issue #13827:
URL: https://github.com/apache/hudi/issues/13827#issuecomment-3248511721
This is one way I tested it in Spark:
```
# Filesystem location of the Hudi table this repro writes to and reads from.
PATH = "/data/hudi_mor_test"

# Session configured with the Kryo serializer plus the Hudi SQL extension and
# Hudi catalog classes named in the config values below.
# NOTE(review): `SparkSession` is not imported in this snippet; presumably
# `from pyspark.sql import SparkSession` precedes it — confirm.
spark = (
    SparkSession.builder.appName("hudi-incremental-repro")
    .config(
        "spark.serializer",
        "org.apache.spark.serializer.KryoSerializer",
    )
    .config(
        "spark.sql.extensions",
        "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
    )
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
    )
    .getOrCreate()
)
def upsert(rows):
    """Upsert ``rows`` into the Hudi table stored at ``PATH``.

    Relies on the module-level ``spark`` session and ``PATH`` constant.
    NOTE(review): ``Row`` is not imported in this snippet; presumably
    ``from pyspark.sql import Row`` precedes it — confirm.

    Args:
        rows: Iterable of dicts. Each dict is expanded into a ``Row`` via
            keyword arguments, so its keys become the column names. The
            write options below name ``id`` as the record key field,
            ``ts`` as the precombine field and ``city`` as the partition
            path field, so each dict is expected to carry those keys.
    """
    opts = {
        "hoodie.table.name": "t_bug",
        "hoodie.write.concurrency.mode": "NON_BLOCKING_CONCURRENCY_CONTROL",
        "hoodie.table.type": "MERGE_ON_READ",
        "hoodie.datasource.write.table.type": "MERGE_ON_READ",
        "hoodie.datasource.write.recordkey.field": "id",
        "hoodie.datasource.write.precombine.field": "ts",
        "hoodie.datasource.write.partitionpath.field": "city",
        "hoodie.index.type": "BUCKET",
        "hoodie.write.lock.provider":
            "org.apache.hudi.client.transaction.lock.FileSystemBasedLockProvider",
        "hoodie.index.bucket.engine": "SIMPLE",
        "hoodie.bucket.index.num.buckets": "1",
        "hoodie.metadata.enable": "false",
        "hoodie.clean.failed.writes.policy": "LAZY",
    }
    df = spark.createDataFrame([Row(**r) for r in rows])
    # mode("append") adds to the existing table at PATH; the write operation
    # itself is set to "upsert" through the datasource option.
    (
        df.write
        .format("hudi")
        .option("hoodie.datasource.write.operation", "upsert")
        .options(**opts)
        .mode("append")
        .save(PATH)
    )
# Run two upserts (i = 1, 2). Every batch carries the same record key "id-1"
# in three different `city` partitions, with `ts` and the `rider` suffix
# advancing each round, so the second batch updates the rows of the first.
for i in range(1, 3):
    upsert(
        [
            {"id": "id-1", "ts": i, "rider": f"r1{i}", "city": "la"},
            {"id": "id-1", "ts": i, "rider": f"r2{i}", "city": "ny"},
            {"id": "id-1", "ts": i, "rider": f"r3{i}", "city": "sf"},
        ]
    )
# Default (snapshot) read of the table at PATH.
snap = spark.read.format("hudi").load(PATH)

# Incremental read starting from instant "000", with the full-table-scan
# fallback explicitly disabled.
incr = (
    spark.read.format("hudi")
    .option("hoodie.datasource.query.type", "incremental")
    .option("hoodie.datasource.read.begin.instanttime", "000")
    .option(
        "hoodie.datasource.read.incr.fallback.fulltablescan.enable",
        "false",
    )
    .load(PATH)
)

# Print both result sets untruncated so they can be compared side by side.
snap.show(truncate=False)
incr.show(truncate=False)

spark.stop()
+-------------------+---------------------+------------------+----------------------+--------------------------------------+----+---+-----+----+
|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|id |ts |rider|city|
+-------------------+---------------------+------------------+----------------------+--------------------------------------+----+---+-----+----+
|20250903093733351 |20250903093733351_0_4|id-1 |sf
|00000000-0000-0000-0000-000000000000-0|id-1|2 |r32 |sf |
|20250903093733351 |20250903093733351_2_6|id-1 |ny
|00000000-0000-0000-0000-000000000000-0|id-1|2 |r22 |ny |
|20250903093733351 |20250903093733351_1_5|id-1 |la
|00000000-0000-0000-0000-000000000000-0|id-1|2 |r12 |la |
+-------------------+---------------------+------------------+----------------------+--------------------------------------+----+---+-----+----+
+-------------------+---------------------+------------------+----------------------+--------------------------------------+----+---+-----+----+
|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|id |ts |rider|city|
+-------------------+---------------------+------------------+----------------------+--------------------------------------+----+---+-----+----+
|20250903093733351 |20250903093733351_1_5|id-1 |la
|00000000-0000-0000-0000-000000000000-0|id-1|2 |r12 |la |
|20250903093733351 |20250903093733351_2_6|id-1 |ny
|00000000-0000-0000-0000-000000000000-0|id-1|2 |r22 |ny |
|20250903093733351 |20250903093733351_0_4|id-1 |sf
|00000000-0000-0000-0000-000000000000-0|id-1|2 |r32 |sf |
+-------------------+---------------------+------------------+----------------------+--------------------------------------+----+---+-----+----+
la\.00000000-0000-0000-0000-000000000000-0_20250903093726835.log.1_1-5-97
la\.00000000-0000-0000-0000-000000000000-0_20250903093733351.log.1_1-15-201
ny\.00000000-0000-0000-0000-000000000000-0_20250903093726835.log.1_2-5-98
ny\.00000000-0000-0000-0000-000000000000-0_20250903093733351.log.1_2-15-202
sf\.00000000-0000-0000-0000-000000000000-0_20250903093726835.log.1_0-5-96
sf\.00000000-0000-0000-0000-000000000000-0_20250903093733351.log.1_0-15-200
```
I wasn't able to reproduce this in Spark. Note, though, that the file
extensions are unique here, so it might be related to that?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]