[ 
https://issues.apache.org/jira/browse/HUDI-4509?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

xi chaomin updated HUDI-4509:
-----------------------------
    Description: 
When I read a Hudi table with Spark readStream, a FileNotFoundException happens 
after a while
{code:java}
    val df: DataFrame = spark.readStream.format("org.apache.hudi")
      .load("hdfs://10.19.29.148:8020/tmp/hudi/ss_bucket")
    df.writeStream
      .format("console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .option("checkpointLocation", "/tmp/hoodie/checkpoint_xicm")
      .start()
      .awaitTermination()
{code}



Caused by: org.apache.hudi.exception.HoodieException: Exception when reading 
log file 
        at 
org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternal(AbstractHoodieLogRecordReader.java:352)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scan(AbstractHoodieLogRecordReader.java:192)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.performScan(HoodieMergedLogRecordScanner.java:110)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:103)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner$Builder.build(HoodieMergedLogRecordScanner.java:324)
 ~[classes/:?]
        at 
org.apache.hudi.HoodieMergeOnReadRDD$.scanLog(HoodieMergeOnReadRDD.scala:402) 
~[classes/:?]
        at 
org.apache.hudi.HoodieMergeOnReadRDD$LogFileIterator.<init>(HoodieMergeOnReadRDD.scala:196)
 ~[classes/:?]
        at 
org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:124) 
~[classes/:?]
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.scheduler.Task.run(Task.scala:123) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at 
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
 ~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at 
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 
~[?:1.8.0_211]
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 
~[?:1.8.0_211]
        at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_211]
Caused by: org.apache.hudi.exception.HoodieIOException: IOException when 
reading logblock from log file 
HoodieLogFile{pathStr='hdfs://10.19.29.148:8020/tmp/hudi/ss_bucket/.00000000-cfd8-4973-badf-2cf7d5853cab-0_20220729172717399.log.1_0-1033-1923',
 fileLen=-1}
        at 
org.apache.hudi.common.table.log.HoodieLogFileReader.next(HoodieLogFileReader.java:389)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.HoodieLogFormatReader.next(HoodieLogFormatReader.java:123)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternal(AbstractHoodieLogRecordReader.java:229)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scan(AbstractHoodieLogRecordReader.java:192)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.performScan(HoodieMergedLogRecordScanner.java:110)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:103)
 ~[classes/:?]
        at 
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner$Builder.build(HoodieMergedLogRecordScanner.java:324)
 ~[classes/:?]
        at 
org.apache.hudi.HoodieMergeOnReadRDD$.scanLog(HoodieMergeOnReadRDD.scala:402) 
~[classes/:?]
        at 
org.apache.hudi.HoodieMergeOnReadRDD$LogFileIterator.<init>(HoodieMergeOnReadRDD.scala:196)
 ~[classes/:?]
        at 
org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:124) 
~[classes/:?]
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.scheduler.Task.run(Task.scala:123) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at 
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
 ~[spark-core_2.11-2.4.4.jar:2.4.4]
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at 
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) 
~[spark-core_2.11-2.4.4.jar:2.4.4]
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 
~[?:1.8.0_211]
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 
~[?:1.8.0_211]
        at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_211]
Caused by: java.io.FileNotFoundException: File does not exist: 
/tmp/hudi/ss_bucket/.00000000-cfd8-4973-badf-2cf7d5853cab-0_20220729172717399.log.1_0-1033-1923
        at 
org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
        at 
org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)

  was:
When I read a Hudi table with Spark readStream, a FileNotFoundException happens 
after a while
{code:java}
    val df: DataFrame = spark.readStream.format("org.apache.hudi")
      .load("hdfs://10.19.29.148:8020/tmp/hudi/ss_bucket")
    df.writeStream
      .format("console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .option("checkpointLocation", "/tmp/hoodie/checkpoint_xicm")
      .start()
      .awaitTermination()
{code}



> FileNotFoundException while doing readStream
> --------------------------------------------
>
>                 Key: HUDI-4509
>                 URL: https://issues.apache.org/jira/browse/HUDI-4509
>             Project: Apache Hudi
>          Issue Type: Bug
>            Reporter: xi chaomin
>            Priority: Major
>
> When I read a Hudi table with Spark readStream, a FileNotFoundException happens 
> after a while
> {code:java}
>     val df: DataFrame = spark.readStream.format("org.apache.hudi")
>       .load("hdfs://10.19.29.148:8020/tmp/hudi/ss_bucket")
>     df.writeStream
>       .format("console")
>       .trigger(Trigger.ProcessingTime("5 seconds"))
>       .option("checkpointLocation", "/tmp/hoodie/checkpoint_xicm")
>       .start()
>       .awaitTermination()
> {code}
> Caused by: org.apache.hudi.exception.HoodieException: Exception when reading 
> log file 
>       at 
> org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternal(AbstractHoodieLogRecordReader.java:352)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scan(AbstractHoodieLogRecordReader.java:192)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.performScan(HoodieMergedLogRecordScanner.java:110)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:103)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner$Builder.build(HoodieMergedLogRecordScanner.java:324)
>  ~[classes/:?]
>       at 
> org.apache.hudi.HoodieMergeOnReadRDD$.scanLog(HoodieMergeOnReadRDD.scala:402) 
> ~[classes/:?]
>       at 
> org.apache.hudi.HoodieMergeOnReadRDD$LogFileIterator.<init>(HoodieMergeOnReadRDD.scala:196)
>  ~[classes/:?]
>       at 
> org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:124) 
> ~[classes/:?]
>       at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at 
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.scheduler.Task.run(Task.scala:123) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at 
> org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
>  ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at 
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  ~[?:1.8.0_211]
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  ~[?:1.8.0_211]
>       at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_211]
> Caused by: org.apache.hudi.exception.HoodieIOException: IOException when 
> reading logblock from log file 
> HoodieLogFile{pathStr='hdfs://10.19.29.148:8020/tmp/hudi/ss_bucket/.00000000-cfd8-4973-badf-2cf7d5853cab-0_20220729172717399.log.1_0-1033-1923',
>  fileLen=-1}
>       at 
> org.apache.hudi.common.table.log.HoodieLogFileReader.next(HoodieLogFileReader.java:389)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.HoodieLogFormatReader.next(HoodieLogFormatReader.java:123)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternal(AbstractHoodieLogRecordReader.java:229)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scan(AbstractHoodieLogRecordReader.java:192)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.performScan(HoodieMergedLogRecordScanner.java:110)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:103)
>  ~[classes/:?]
>       at 
> org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner$Builder.build(HoodieMergedLogRecordScanner.java:324)
>  ~[classes/:?]
>       at 
> org.apache.hudi.HoodieMergeOnReadRDD$.scanLog(HoodieMergeOnReadRDD.scala:402) 
> ~[classes/:?]
>       at 
> org.apache.hudi.HoodieMergeOnReadRDD$LogFileIterator.<init>(HoodieMergeOnReadRDD.scala:196)
>  ~[classes/:?]
>       at 
> org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:124) 
> ~[classes/:?]
>       at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at 
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.scheduler.Task.run(Task.scala:123) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at 
> org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
>  ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at 
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) 
> ~[spark-core_2.11-2.4.4.jar:2.4.4]
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  ~[?:1.8.0_211]
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  ~[?:1.8.0_211]
>       at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_211]
> Caused by: java.io.FileNotFoundException: File does not exist: 
> /tmp/hudi/ss_bucket/.00000000-cfd8-4973-badf-2cf7d5853cab-0_20220729172717399.log.1_0-1033-1923
>       at 
> org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
>       at 
> org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to