[
https://issues.apache.org/jira/browse/HUDI-7276?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17886916#comment-17886916
]
Lin Liu commented on HUDI-7276:
-------------------------------
Without log files present, the query succeeds:
{code:java}
scala> sdf.write.format("hudi").
| option("hoodie.table.name", "test_table").
| option("hoodie.datasource.write.recordkey.field", "key").
| option("hoodie.datasource.write.table.type", "MERGE_ON_READ").
| option("hoodie.datasource.read.use.new.parquet.file.format", "true").
| option("hoodie.file.group.reader.enabled", "true").
| option("hoodie.datasource.write.operation", "upsert").
| option("hoodie.merge.small.file.group.candidates.limit", "0").
| mode("Append").
| save(basePath)
24/10/04 13:14:08 WARN ConfigUtils: The configuration key
'hoodie.cleaner.policy.failed.writes' has been deprecated and may be removed in
the future. Please use the new key 'hoodie.clean.failed.writes.policy' instead.
24/10/04 13:14:09 WARN HoodieTableMetadata: Falling back to
FileSystemBackedTableMetadata as metadata table is not initialized
24/10/04 13:15:18 WARN HoodieSparkSqlWriterInternal: Closing write client
scala>
scala> val basePath = "s3a://lokesh-testing/write-benchmark/write-bench/avro_second_no_logs"
basePath: String = s3a://lokesh-testing/write-benchmark/write-bench/avro_second_no_logs
scala> val rsdf = spark.read.
| option("hoodie.file.group.reader.enabled", "true").
| format("hudi").load(basePath)
rsdf: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 12 more fields]
scala> rsdf.createOrReplaceTempView("my_table")
scala> spark.sql("SELECT * FROM my_table LIMIT 1").show(false)
{"type":"record","name":"test_table_record","namespace":"hoodie.test_table","fields":[{"name":"_hoodie_commit_time","type":["null","string"],"default":null},{"name":"_hoodie_commit_seqno","type":["null","string"],"default":null},{"name":"_hoodie_record_key","type":["null","string"],"default":null},{"name":"_hoodie_partition_path","type":["null","string"],"default":null},{"name":"_hoodie_file_name","type":["null","string"],"default":null},{"name":"key","type":["null","string"],"default":null},{"name":"partition","type":["null","string"],"default":null},{"name":"ts","type":["null","long"],"default":null},{"name":"textField","type":["null","string"],"default":null},{"name":"decimalField","type":["null","float"],"default":null},{"name":"longField","type":["null","long"],"default":null},{"name":"arrayField","type":["null",{"type":"array","items":["null","int"]}],"default":null},{"name":"mapField","type":["null",{"type":"map","values":["null","int"]}],"default":null},{"name":"round","type":["null","int"],"default":null}]}
{"type":"record","name":"test_table_record","namespace":"hoodie.test_table","fields":[{"name":"_hoodie_commit_time","type":["null","string"],"default":null},{"name":"_hoodie_commit_seqno","type":["null","string"],"default":null},{"name":"_hoodie_record_key","type":["null","string"],"default":null},{"name":"_hoodie_partition_path","type":["null","string"],"default":null},{"name":"_hoodie_file_name","type":["null","string"],"default":null},{"name":"key","type":["null","string"],"default":null},{"name":"partition","type":["null","string"],"default":null},{"name":"ts","type":["null","long"],"default":null},{"name":"textField","type":["null","string"],"default":null},{"name":"decimalField","type":["null","float"],"default":null},{"name":"longField","type":["null","long"],"default":null},{"name":"arrayField","type":["null",{"type":"array","items":["null","int"]}],"default":null},{"name":"mapField","type":["null",{"type":"map","values":["null","int"]}],"default":null},{"name":"round","type":["null","int"],"default":null}]}
{"type":"record","name":"test_table_record","namespace":"hoodie.test_table","fields":[{"name":"_hoodie_commit_time","type":["null","string"],"default":null},{"name":"_hoodie_commit_seqno","type":["null","string"],"default":null},{"name":"_hoodie_record_key","type":["null","string"],"default":null},{"name":"_hoodie_partition_path","type":["null","string"],"default":null},{"name":"_hoodie_file_name","type":["null","string"],"default":null},{"name":"key","type":["null","string"],"default":null},{"name":"partition","type":["null","string"],"default":null},{"name":"ts","type":["null","long"],"default":null},{"name":"textField","type":["null","string"],"default":null},{"name":"decimalField","type":["null","float"],"default":null},{"name":"longField","type":["null","long"],"default":null},{"name":"arrayField","type":["null",{"type":"array","items":["null","int"]}],"default":null},{"name":"mapField","type":["null",{"type":"map","values":["null","int"]}],"default":null},{"name":"round","type":["null","int"],"default":null}]}
{"type":"record","name":"test_table_record","namespace":"hoodie.test_table","fields":[{"name":"_hoodie_commit_time","type":["null","string"],"default":null},{"name":"_hoodie_commit_seqno","type":["null","string"],"default":null},{"name":"_hoodie_record_key","type":["null","string"],"default":null},{"name":"_hoodie_partition_path","type":["null","string"],"default":null},{"name":"_hoodie_file_name","type":["null","string"],"default":null},{"name":"key","type":["null","string"],"default":null},{"name":"partition","type":["null","string"],"default":null},{"name":"ts","type":["null","long"],"default":null},{"name":"textField","type":["null","string"],"default":null},{"name":"decimalField","type":["null","float"],"default":null},{"name":"longField","type":["null","long"],"default":null},{"name":"arrayField","type":["null",{"type":"array","items":["null","int"]}],"default":null},{"name":"mapField","type":["null",{"type":"map","values":["null","int"]}],"default":null},{"name":"round","type":["null","int"],"default":null}]}
_hoodie_commit_time    : 20241004131408481
_hoodie_commit_seqno   : 20241004131408481_1_0
_hoodie_record_key     : 000-1c7d5894-1ea0-44b6-918f-e20e77dc6ea8
_hoodie_partition_path :
_hoodie_file_name      : d05f1a83-0076-450c-8e33-33b534e60476-0_1-25-150_20241004131408481.parquet
key                    : 000-1c7d5894-1ea0-44b6-918f-e20e77dc6ea8
partition              : 2024/02/12
ts                     : 1708030565977
textField              : abcdefghijklmnopqrstuvwxyz|abcdefghijklmnopqrstuvwxyz|abcdefghijklmnopqrstuvwxyz
decimalField           : 0.008004248
longField              : 3222960758887408274
arrayField             : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
mapField               : {fcb17aea-9ee9-45e4-83ea-b7a164601be0 -> -367158012, 124ce652-27d3-4612-b143-bb713de66923 -> -639554960, 0b0e10a4-52da-4f3e-a916-093c309f2ea7 -> -781039061, 5978c88c-645a-494e-97e0-29b7dd7be477 -> -726321438, 6623b267-791c-439f-a194-4deaf544757c -> -1337824530}
round                  : 0
{code}
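For contrast, a minimal sketch of the case that exercises the log-file path (untested here; it assumes the same spark-shell session, the existing {{sdf}} DataFrame, and {{lit}} from {{org.apache.spark.sql.functions}}): re-upserting the same keys should write MOR log files, which is where the reported IOException surfaces.
{code:java}
// Hedged sketch: update every existing key so the MOR upsert appends log
// files to the existing file groups instead of writing only base files.
import org.apache.spark.sql.functions.lit

val updates = sdf.withColumn("round", lit(1))
updates.write.format("hudi").
  option("hoodie.table.name", "test_table").
  option("hoodie.datasource.write.recordkey.field", "key").
  option("hoodie.datasource.write.table.type", "MERGE_ON_READ").
  option("hoodie.datasource.write.operation", "upsert").
  mode("Append").
  save(basePath)

// With log files present, this read must merge base and log files on the
// file group reader path.
spark.read.
  option("hoodie.file.group.reader.enabled", "true").
  format("hudi").load(basePath).count()
{code}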
> Fix IOException on the File group reader path
> ---------------------------------------------
>
> Key: HUDI-7276
> URL: https://issues.apache.org/jira/browse/HUDI-7276
> Project: Apache Hudi
> Issue Type: Bug
> Components: spark
> Reporter: xy
> Assignee: Lin Liu
> Priority: Blocker
> Labels: hudi-1.0.0-beta2, pull-request-available
> Fix For: 1.0.0
>
>
> FILE_GROUP_READER_ENABLED should be disabled for the query as a workaround (see the sketch below).
>
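> A minimal sketch of that workaround (the config key is the one used in the reproduction above; the surrounding session and {{basePath}} are assumed):
> {code:java}
> // Workaround sketch: disable the new file group reader for this read so
> // the query falls back to the legacy MOR read path.
> val df = spark.read.
>   option("hoodie.file.group.reader.enabled", "false").
>   format("hudi").load(basePath)
> df.count()
> {code}
>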
> java.io.IOException: com.esotericsoftware.kryo.KryoException: java.lang.NullPointerException
> Serialization trace:
> props (org.apache.avro.Schema$LongSchema)
> types (org.apache.avro.Schema$UnionSchema)
> schema (org.apache.avro.Schema$Field)
> fieldMap (org.apache.avro.Schema$RecordSchema)
> at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1453)
> at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:226)
> at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:103)
> at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
> at org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat.$anonfun$buildReaderWithPartitionValues$3(HoodieFileGroupReaderBasedParquetFileFormat.scala:149)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:117)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:165)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:94)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
> at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
> at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
> at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
> at org.apache.spark.scheduler.Task.run(Task.scala:131)
> at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1480)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: com.esotericsoftware.kryo.KryoException: java.lang.NullPointerException
> Serialization trace:
> props (org.apache.avro.Schema$LongSchema)
> types (org.apache.avro.Schema$UnionSchema)
> schema (org.apache.avro.Schema$Field)
> fieldMap (org.apache.avro.Schema$RecordSchema)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:144)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:543)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:813)
> at com.esotericsoftware.kryo.serializers.CollectionSerializer.read(CollectionSerializer.java:134)
> at com.esotericsoftware.kryo.serializers.CollectionSerializer.read(CollectionSerializer.java:40)
> at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:731)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:125)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:543)
> at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:731)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:125)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:543)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:813)
> at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:161)
> at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:39)
> at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:731)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:125)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:543)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:813)
> at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:306)
> at org.apache.spark.broadcast.TorrentBroadcast$.$anonfun$unBlockifyObject$4(TorrentBroadcast.scala:336)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1480)
> at org.apache.spark.broadcast.TorrentBroadcast$.unBlockifyObject(TorrentBroadcast.scala:338)
> at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$4(TorrentBroadcast.scala:257)
> at scala.Option.getOrElse(Option.scala:189)
> at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$2(TorrentBroadcast.scala:231)
> at org.apache.spark.util.KeyLock.withLock(KeyLock.scala:64)
> at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$readBroadcastBlock$1(TorrentBroadcast.scala:226)
> at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1446)
> ... 24 more
> Caused by: java.lang.NullPointerException
> at org.apache.avro.JsonProperties$2.putIfAbsent(JsonProperties.java:159)
> at org.apache.avro.JsonProperties$2.put(JsonProperties.java:166)
> at org.apache.avro.JsonProperties$2.put(JsonProperties.java:151)
> at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:162)
> at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:39)
> at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:731)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:125)
> ... 51 more
>
> Driver stacktrace:
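>
> One plausible reading of the trace (an interpretation, not confirmed in this ticket): Kryo field-serializes {{org.apache.avro.Schema}} for the broadcast, and on deserialization its {{MapSerializer}} calls {{put()}} on the Avro {{JsonProperties}} map before the schema's backing state is initialized, hence the NullPointerException. A hedged fix sketch is to round-trip the schema through its JSON form with a custom Kryo serializer (class name hypothetical):
> {code:java}
> // Hypothetical serializer: write an Avro Schema as its canonical JSON
> // string and re-parse it on read, bypassing field-by-field serialization.
> import com.esotericsoftware.kryo.{Kryo, Serializer}
> import com.esotericsoftware.kryo.io.{Input, Output}
> import org.apache.avro.Schema
>
> class AvroSchemaSerializer extends Serializer[Schema] {
>   override def write(kryo: Kryo, output: Output, schema: Schema): Unit =
>     output.writeString(schema.toString) // JSON representation
>
>   override def read(kryo: Kryo, input: Input, clazz: Class[Schema]): Schema =
>     new Schema.Parser().parse(input.readString())
> }
> {code}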
>
>
> Reproduction route: Spark bulk insert, then MERGE INTO, and finally a query in Spark SQL (see the sketch below).
>
> Spark version: 3.2.0
> Hudi version: 1.0 (master)
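>
> A hedged sketch of that route in Spark SQL (table and column names are hypothetical, not taken from this ticket; the bulk-insert config key is the one documented for recent Hudi releases and is an assumption here):
> {code:java}
> // 1. Bulk insert into a MOR table.
> spark.sql("SET hoodie.spark.sql.insert.into.operation=bulk_insert")
> spark.sql("""
>   CREATE TABLE t1 (key STRING, ts BIGINT, round INT) USING hudi
>   TBLPROPERTIES (type = 'mor', primaryKey = 'key', preCombineField = 'ts')
> """)
> spark.sql("INSERT INTO t1 VALUES ('k1', 1L, 0)")
>
> // 2. MERGE INTO routes the update into MOR log files.
> spark.sql("""
>   MERGE INTO t1 t
>   USING (SELECT 'k1' AS key, 2L AS ts, 1 AS round) s
>   ON t.key = s.key
>   WHEN MATCHED THEN UPDATE SET *
> """)
>
> // 3. The query that hit the IOException on the file group reader path.
> spark.sql("SELECT COUNT(*) FROM t1").show()
> {code}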