[
https://issues.apache.org/jira/browse/HUDI-3749?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17516162#comment-17516162
]
sivabalan narayanan edited comment on HUDI-3749 at 4/1/22 11:09 PM:
--------------------------------------------------------------------
Built hudi locally from the latest master with the -Dspark3.1 profile, and tried to run it against EMR Spark.
1. Writes went through fine w/ metadata enabled.
2. Reads from hudi failed w/ a NoClassDefFoundError (full trace below).
{code:java}
hudiDf.count
java.lang.NoClassDefFoundError: org/apache/spark/sql/catalyst/expressions/AliasHelper
  at org.apache.hudi.BaseFileOnlyRelation.$anonfun$collectFileSplits$2(BaseFileOnlyRelation.scala:89)
  at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
  at scala.collection.immutable.List.foreach(List.scala:392)
  at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
  at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
  at scala.collection.immutable.List.flatMap(List.scala:355)
  at org.apache.hudi.BaseFileOnlyRelation.$anonfun$collectFileSplits$1(BaseFileOnlyRelation.scala:86)
  at scala.collection.immutable.Stream.flatMap(Stream.scala:489)
  at org.apache.hudi.BaseFileOnlyRelation.collectFileSplits(BaseFileOnlyRelation.scala:85)
  at org.apache.hudi.HoodieBaseRelation.buildScan(HoodieBaseRelation.scala:198)
  at org.apache.spark.sql.execution.datasources.DataSourceStrategy.$anonfun$apply$4(DataSourceStrategy.scala:298)
  at org.apache.spark.sql.execution.datasources.DataSourceStrategy.$anonfun$pruneFilterProject$1(DataSourceStrategy.scala:331)
  at org.apache.spark.sql.execution.datasources.DataSourceStrategy.pruneFilterProjectRaw(DataSourceStrategy.scala:386)
  at org.apache.spark.sql.execution.datasources.DataSourceStrategy.pruneFilterProject(DataSourceStrategy.scala:330)
  at org.apache.spark.sql.execution.datasources.DataSourceStrategy.apply(DataSourceStrategy.scala:298)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63)
  at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
  at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
  at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
  at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:69)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
  at scala.collection.TraversableOnce.$anonfun$foldLeft$1(TraversableOnce.scala:162)
  at scala.collection.TraversableOnce.$anonfun$foldLeft$1$adapted(TraversableOnce.scala:162)
  at scala.collection.Iterator.foreach(Iterator.scala:941)
  at scala.collection.Iterator.foreach$(Iterator.scala:941)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
  at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162)
  at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160)
  at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75)
  at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
  at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
  at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:69)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
  at scala.collection.TraversableOnce.$anonfun$foldLeft$1(TraversableOnce.scala:162)
  at scala.collection.TraversableOnce.$anonfun$foldLeft$1$adapted(TraversableOnce.scala:162)
  at scala.collection.Iterator.foreach(Iterator.scala:941)
  at scala.collection.Iterator.foreach$(Iterator.scala:941)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
  at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162)
  at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160)
  at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75)
  at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
  at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
  at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
  at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:69)
  at org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:365)
  at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:94)
  at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:149)
  at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:153)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
  at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:153)
  at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:94)
  at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:87)
  at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:107)
  at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:149)
  at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:153)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
  at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:153)
  at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:104)
  at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:100)
  at org.apache.spark.sql.execution.QueryExecution.$anonfun$writePlans$5(QueryExecution.scala:219)
  at org.apache.spark.sql.catalyst.plans.QueryPlan$.append(QueryPlan.scala:381)
  at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$writePlans(QueryExecution.scala:219)
  at org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:227)
  at org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:99)
  at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:132)
  at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:104)
  at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:227)
  at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:132)
  at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:248)
  at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:131)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
  at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3665)
  at org.apache.spark.sql.Dataset.count(Dataset.scala:3020)
  ... 57 elided
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.catalyst.expressions.AliasHelper
  at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
  ... 136 more {code}
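For anyone triaging: hudi built w/ -Dspark3.1 compiles against OSS Spark 3.1, where org.apache.spark.sql.catalyst.expressions.AliasHelper exists, so the ClassNotFoundException suggests the Spark jars actually on the EMR classpath diverge from OSS 3.1 at that point. A quick probe one could paste into the same spark-shell (a hypothetical diagnostic sketch, not part of the repro) is:
{code:java}
// Hypothetical classpath probe: report which Spark the session claims to be,
// and whether the catalyst class hudi expects is resolvable on the driver.
println(s"Spark version: ${spark.version}")
try {
  val cls = Class.forName("org.apache.spark.sql.catalyst.expressions.AliasHelper")
  // If it resolves, print which jar it was loaded from (when available).
  Option(cls.getProtectionDomain.getCodeSource).map(_.getLocation).foreach(println)
} catch {
  case _: ClassNotFoundException =>
    println("AliasHelper is not on the driver classpath")
}
{code}
If this prints a jar location on a stock OSS Spark 3.1 shell but throws here, that would isolate the difference to the EMR runtime jars.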
Code snippet used to reproduce the above:
{code:java}
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._

sys.props.update("spark.ui.proxyBase", "")

val df = spark.read.parquet("s3_input_data")
val basePath = "/tmp/hudi_tbl7"

// Initial write of the source data.
df.write.format("hudi").
  option(PRECOMBINE_FIELD_OPT_KEY, "created_at").
  option(RECORDKEY_FIELD_OPT_KEY, "id").
  option(PARTITIONPATH_FIELD_OPT_KEY, "type").
  option(TABLE_NAME, "hudi_tbl1").
  option("hoodie.embed.timeline.server", "false").
  mode(Overwrite).
  save(basePath)

// Write the same data again to exercise the append/upsert path.
df.write.format("hudi").
  option(PRECOMBINE_FIELD_OPT_KEY, "created_at").
  option(RECORDKEY_FIELD_OPT_KEY, "id").
  option(PARTITIONPATH_FIELD_OPT_KEY, "type").
  option(TABLE_NAME, "hudi_tbl1").
  option("hoodie.embed.timeline.server", "false").
  mode(Append).
  save(basePath)

// The read below is what fails with the NoClassDefFoundError.
val hudiDf = spark.read.format("hudi").load(basePath)
hudiDf.count {code}
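As a side note, once the read path works, a minimal sanity check on the same table could look like the sketch below. It assumes the Append write above upserts the same record keys rather than adding new rows (an assumption about this dataset, not something verified here), in which case the snapshot count should match the input count:
{code:java}
// Hypothetical sanity check: with the same record keys upserted twice, the
// Hudi snapshot row count should equal the input row count.
val expected = df.count()
val actual = spark.read.format("hudi").load(basePath).count()
assert(actual == expected, s"expected $expected rows at $basePath, got $actual")
{code}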
> Run latest hudi w/ EMR spark and report to aws folks
> ----------------------------------------------------
>
> Key: HUDI-3749
> URL: https://issues.apache.org/jira/browse/HUDI-3749
> Project: Apache Hudi
> Issue Type: Task
> Components: tests-ci
> Reporter: sivabalan narayanan
> Assignee: sivabalan narayanan
> Priority: Blocker
> Fix For: 0.11.0
>
>
--
This message was sent by Atlassian Jira
(v8.20.1#820001)