[SUPPORT] NullPointerException in HoodieROTablePathFilter Hudi 0.10.0
Describe the problem you faced
NullPointerException in HoodieROTablePathFilter while querying a Hudi table with 0.10.0; the same query works with 0.9.0.
Workaround:
Revert https://github.com/apache/hudi/pull/3719
To Reproduce
Steps to reproduce the behavior:
- Create a COW table partitioned by date (a write sketch follows this list)
- Query two months' worth of partitions
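For reference, a minimal write sketch of the setup described above. The column names (responseid, completetime), the input frame df, and the use of hudiBasePath are hypothetical stand-ins to make the reproduction concrete, not the reporter's actual write job:

import org.apache.spark.sql.SaveMode
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieWriteConfig

// df is a hypothetical DataFrame with a responseid key column and a
// completetime date column used as the partition path.
df.write
  .format("org.apache.hudi")
  .option(HoodieWriteConfig.TBL_NAME.key(), "completes_by_completedate")
  .option(DataSourceWriteOptions.TABLE_TYPE.key(), DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
  .option(DataSourceWriteOptions.RECORDKEY_FIELD.key(), "responseid")
  .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "completetime")
  .option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), "completetime")
  .mode(SaveMode.Append)
  .save(s"${hudiBasePath}/completes_by_completedate")

The completetime=2021/10/10 layout in the stack trace below suggests the partition values themselves contain slashes, yielding one directory per day.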
Query sample
import java.time.LocalDate
import org.apache.hudi.DataSourceReadOptions

// `dayStream`, `testDirExists`, `sd` (a date formatter), `hudiBasePath`,
// `logger`, and `spark` are defined elsewhere in the job.
def createViews(startDate: LocalDate, lastDate: LocalDate): Unit = {
  logger.info("Creating Views")
  logger.info(s"Converting ${startDate} and ${lastDate}")
  // Computed but unused in this excerpt:
  val nextMonthDate = lastDate.minusDays(1).plusMonths(1)
  val endOfNextMonthDate = nextMonthDate.withDayOfMonth(nextMonthDate.lengthOfMonth())
  val currentDate = LocalDate.now()
  val dayBeforeStart = startDate.minusDays(1)
  // One glob per existing daily partition over the requested range.
  val completesDateRange = dayStream(startDate, lastDate).toList
  val completesPaths = completesDateRange
    .map(date => s"${hudiBasePath}/completes_by_completedate/completetime=${date.format(sd)}/*")
    .filter(p => testDirExists(p.dropRight(1)))
  val paths = completesPaths.mkString(",")
  val view = spark.read
    .format("org.apache.hudi")
    .option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
    .option(DataSourceReadOptions.READ_PATHS.key(), paths)
    .load()
  view.createOrReplaceTempView("completes")
}
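The helpers referenced in createViews are not included in the report. Plausible stand-in definitions, assuming sd formats dates as yyyy/MM/dd to match the completetime=2021/10/10 partition layout seen in the stack trace:

import java.time.LocalDate
import java.time.format.DateTimeFormatter
import org.apache.hadoop.fs.Path

// Assumed formatter: one partition directory per day, sliced year/month/day.
val sd = DateTimeFormatter.ofPattern("yyyy/MM/dd")

// All days from start to end, inclusive.
def dayStream(start: LocalDate, end: LocalDate): Iterator[LocalDate] =
  Iterator.iterate(start)(_.plusDays(1)).takeWhile(!_.isAfter(end))

// True if the partition directory exists on the table's filesystem.
def testDirExists(path: String): Boolean = {
  val p = new Path(path)
  p.getFileSystem(spark.sparkContext.hadoopConfiguration).exists(p)
}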
s"""
|SELECT bc.responseid AS ResponseID,
|FROM completes bc
|WHERE bc.CompleteTime BETWEEN '${startDate}' AND '${stopDate}'
|ORDER BY bc.completetime DESC
|""".stripMargin
Expected behavior
The query completes successfully, as it does with Hudi 0.9.0 on the same table and code.
Environment Description
- Hudi version : 0.10.0
- Spark version : 3.1.2
- Hive version :
- Hadoop version : 3.2.1
- Storage (HDFS/S3/GCS…) : S3
- Running on Docker? (yes/no) : no
Stacktrace
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 34 in stage 0.0 failed 10 times, most recent failure: Lost task 34.9 in stage 0.0 (TID 336) (<host> executor 7): org.apache.hudi.exception.HoodieException: Error checking path :s3a://<path>/completetime=2021/10/10/50c3f98a-bf59-45d1-a01e-602f42f13ed9-0_651-10-4279_20211202004113620.parquet, under folder: s3a://<path>/completetime=2021/10/10
    at org.apache.hudi.hadoop.HoodieROTablePathFilter.accept(HoodieROTablePathFilter.java:230)
    at org.apache.spark.sql.execution.datasources.PathFilterWrapper.accept(InMemoryFileIndex.scala:227)
    at org.apache.spark.util.HadoopFSUtils$.$anonfun$listLeafFiles$8(HadoopFSUtils.scala:318)
    at org.apache.spark.util.HadoopFSUtils$.$anonfun$listLeafFiles$8$adapted(HadoopFSUtils.scala:318)
    at scala.collection.TraversableLike.$anonfun$filterImpl$1(TraversableLike.scala:256)
    at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
    at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
    at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:255)
    at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:249)
    at scala.collection.mutable.ArrayOps$ofRef.filterImpl(ArrayOps.scala:198)
    at scala.collection.TraversableLike.filter(TraversableLike.scala:347)
    at scala.collection.TraversableLike.filter$(TraversableLike.scala:347)
    at scala.collection.mutable.ArrayOps$ofRef.filter(ArrayOps.scala:198)
    at org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:318)
    at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$6(HadoopFSUtils.scala:138)
    at scala.collection.immutable.Stream.map(Stream.scala:418)
    at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$4(HadoopFSUtils.scala:128)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:131)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
    at org.apache.hudi.hadoop.HoodieROTablePathFilter.accept(HoodieROTablePathFilter.java:185)
    ... 33 more
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2465)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2414)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2413)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2413)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1124)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1124)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1124)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2679)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2621)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2610)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:914)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2259)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2278)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
    at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
    at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFilesInternal(HadoopFSUtils.scala:141)
    at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFiles(HadoopFSUtils.scala:71)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.bulkListLeafFiles(InMemoryFileIndex.scala:220)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.defaultBulkList$1(InMemoryFileIndex.scala:171)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.$anonfun$bulkListLeafFiles$1(InMemoryFileIndex.scala:191)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.withListObserver(InMemoryFileIndex.scala:197)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.bulkListLeafFiles(InMemoryFileIndex.scala:168)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:155)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:119)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:91)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:77)
    at org.apache.spark.sql.execution.datasources.DataSource.createInMemoryFileIndex(DataSource.scala:578)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:416)
    at org.apache.hudi.DefaultSource.getBaseFileOnlyView(DefaultSource.scala:241)
    at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:117)
    at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:67)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
    at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:326)
    at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:308)
    at scala.Option.getOrElse(Option.scala:189)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:308)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:226)
    at com.lucidhq.etl.reports.monthly.HudiExtractAndLoad.$anonfun$createViews$7(HudiExtractAndLoad.scala:104)
    at com.lucidhq.etl.reports.monthly.HudiExtractAndLoad.$anonfun$createViews$7$adapted(HudiExtractAndLoad.scala:96)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at scala.collection.parallel.ParIterableLike$Foreach.leaf(ParIterableLike.scala:974)
    at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
    at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
    at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
    at scala.collection.parallel.ParIterableLike$Foreach.tryLeaf(ParIterableLike.scala:971)
    at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
    at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
    at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
    at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
    at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
    at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
    at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
    at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
    at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
    at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
    at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175)
    Suppressed: org.apache.spark.SparkException: Job aborted due to stage failure: Task 29 in stage 3.0 failed 10 times, most recent failure: Lost task 29.9 in stage 3.0 (TID 1442) (<host> executor 7): org.apache.hudi.exception.HoodieException: Error checking path :s3a://<path>/journaldate=2021/10/08/fa24b593-90cc-4812-bca6-1d05eb54f139-0_214-12-3638_20211202004112402.parquet, under folder: s3a://<path>/journaldate=2021/10/08
        at org.apache.hudi.hadoop.HoodieROTablePathFilter.accept(HoodieROTablePathFilter.java:230)
        at org.apache.spark.sql.execution.datasources.PathFilterWrapper.accept(InMemoryFileIndex.scala:227)
        at org.apache.spark.util.HadoopFSUtils$.$anonfun$listLeafFiles$8(HadoopFSUtils.scala:318)
        at org.apache.spark.util.HadoopFSUtils$.$anonfun$listLeafFiles$8$adapted(HadoopFSUtils.scala:318)
        at scala.collection.TraversableLike.$anonfun$filterImpl$1(TraversableLike.scala:256)
        at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
        at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
        at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
        at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:255)
        at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:249)
        at scala.collection.mutable.ArrayOps$ofRef.filterImpl(ArrayOps.scala:198)
        at scala.collection.TraversableLike.filter(TraversableLike.scala:347)
        at scala.collection.TraversableLike.filter$(TraversableLike.scala:347)
        at scala.collection.mutable.ArrayOps$ofRef.filter(ArrayOps.scala:198)
        at org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:318)
        at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$6(HadoopFSUtils.scala:138)
        at scala.collection.immutable.Stream.map(Stream.scala:418)
        at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$4(HadoopFSUtils.scala:128)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
        at org.apache.spark.scheduler.Task.run(Task.scala:131)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
    Caused by: java.lang.NullPointerException
        at org.apache.hudi.hadoop.HoodieROTablePathFilter.accept(HoodieROTablePathFilter.java:185)
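Since the HoodieException wraps each file check under its partition folder, one quick way to narrow a failure like this down is to verify that every READ_PATHS glob resolves to a real partition directory carrying Hudi's .hoodie_partition_metadata marker. A hedged diagnostic sketch (not Hudi code; spark is the job's SparkSession):

import org.apache.hadoop.fs.Path

// For each glob passed to READ_PATHS, strip the trailing "/*" and check that
// the partition directory exists and contains .hoodie_partition_metadata.
def checkReadPaths(paths: Seq[String]): Unit =
  paths.foreach { glob =>
    val dir = new Path(glob.stripSuffix("/*"))
    val fs = dir.getFileSystem(spark.sparkContext.hadoopConfiguration)
    val meta = new Path(dir, ".hoodie_partition_metadata")
    println(s"$dir exists=${fs.exists(dir)} hudiPartition=${fs.exists(meta)}")
  }

In this report the partitions were presumably valid, and the regression was traced to https://github.com/apache/hudi/pull/3719, hence the revert workaround noted above.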
Top GitHub Comments
Thanks, I will have to wait for my team; I will test it tomorrow and update the result here.
Closing the issue as it's fixed with the revert PR.