There is an open Jira for this issue (https://issues.apache.org/jira/browse/SPARK-14804), and a few fixes have been proposed there so far.
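In the meantime, here is a possible workaround as a rough, untested sketch: the stack trace below goes through EMLDAOptimizer and GraphX EdgeRDD checkpointing, so you can either leave checkpointing off while using the EM optimizer, or switch to the online optimizer, which does not build a GraphX graph at all. The `corpus` below is a tiny made-up stand-in for your real RDD of [doc_id, term-count vector] pairs, and the parameter values are illustrative, not taken from your job.

    from pyspark import SparkContext
    from pyspark.mllib.clustering import LDA
    from pyspark.mllib.linalg import Vectors

    sc = SparkContext.getOrCreate()

    # Toy stand-in corpus: an RDD of [doc_id, term-count vector], same shape
    # as in the MLlib LDA example.
    corpus = sc.parallelize([
        [0, Vectors.dense([1.0, 2.0, 6.0])],
        [1, Vectors.dense([1.0, 3.0, 0.0])],
        [2, Vectors.dense([1.0, 4.0, 1.0])],
    ])

    # Option 1: keep the default EM optimizer but don't call setCheckpointDir,
    # so the GraphX checkpointing path seen in the trace is never exercised.
    em_model = LDA.train(corpus, k=3, maxIterations=20)

    # Option 2: use the online optimizer, which avoids GraphX entirely.
    online_model = LDA.train(corpus, k=3, maxIterations=20,
                             checkpointInterval=10, optimizer="online")

Note that skipping checkpointing on a long EM run trades the bug for a deeper lineage, so the online optimizer is probably the safer option at 200 iterations.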
On Thu, Nov 3, 2016 at 2:20 PM, jamborta <jambo...@gmail.com> wrote:
> Hi there,
>
> I am trying to run the example LDA code
> (http://spark.apache.org/docs/latest/mllib-clustering.html#latent-dirichlet-allocation-lda)
> on Spark 2.0.0/EMR 5.0.0.
>
> If I run it with checkpointing enabled (sc.setCheckpointDir("s3n://s3-path/")):
>
> ldaModel = LDA.train(corpus, k=3, maxIterations=200, checkpointInterval=10)
>
> I get the following error (sorry, quite long):
>
> Py4JJavaErrorTraceback (most recent call last)
> <ipython-input-10-64711b08964e> in <module>()
> ----> 1 ldaModel = LDA.train(corpus, k=3, maxIterations=200, checkpointInterval=10)
>
> /usr/lib/spark/python/pyspark/mllib/clustering.py in train(cls, rdd, k, maxIterations, docConcentration, topicConcentration, seed, checkpointInterval, optimizer)
>    1037         model = callMLlibFunc("trainLDAModel", rdd, k, maxIterations,
>    1038                               docConcentration, topicConcentration, seed,
> -> 1039                               checkpointInterval, optimizer)
>    1040         return LDAModel(model)
>    1041
>
> /usr/lib/spark/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
>     128     sc = SparkContext.getOrCreate()
>     129     api = getattr(sc._jvm.PythonMLLibAPI(), name)
> --> 130     return callJavaFunc(sc, api, *args)
>     131
>     132
>
> /usr/lib/spark/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
>     121     """ Call Java Function """
>     122     args = [_py2java(sc, a) for a in args]
> --> 123     return _java2py(sc, func(*args))
>     124
>     125
>
> /usr/lib/spark/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
>     931         answer = self.gateway_client.send_command(command)
>     932         return_value = get_return_value(
> --> 933             answer, self.gateway_client, self.target_id, self.name)
>     934
>     935         for temp_arg in temp_args:
>
> /usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
>      61     def deco(*a, **kw):
>      62         try:
> ---> 63             return f(*a, **kw)
>      64         except py4j.protocol.Py4JJavaError as e:
>      65             s = e.java_exception.toString()
>
> /usr/lib/spark/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
>     310                 raise Py4JJavaError(
>     311                     "An error occurred while calling {0}{1}{2}.\n".
> --> 312                     format(target_id, ".", name), value)
>     313             else:
>     314                 raise Py4JError(
>
> Py4JJavaError: An error occurred while calling o115.trainLDAModel.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task 1
> in stage 458.0 failed 4 times, most recent failure: Lost task 1.3 in stage
> 458.0 (TID 14827, ip-10-197-192-2.eu-west-1.compute.internal):
> java.lang.ClassCastException: scala.Tuple2 cannot be cast to org.apache.spark.graphx.Edge
>         at org.apache.spark.graphx.EdgeRDD$$anonfun$1$$anonfun$apply$1.apply(EdgeRDD.scala:107)
>         at scala.collection.Iterator$class.foreach(Iterator.scala:893)
>         at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
>         at org.apache.spark.graphx.EdgeRDD$$anonfun$1.apply(EdgeRDD.scala:107)
>         at org.apache.spark.graphx.EdgeRDD$$anonfun$1.apply(EdgeRDD.scala:105)
>         at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:801)
>         at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:801)
>         at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>         at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:332)
>         at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:330)
>         at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:919)
>         at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:910)
>         at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:866)
>         at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:910)
>         at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:668)
>         at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
>         at org.apache.spark.graphx.EdgeRDD.compute(EdgeRDD.scala:50)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>         at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>         at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>         at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
>         at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
>         at org.apache.spark.scheduler.Task.run(Task.scala:85)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>         at java.lang.Thread.run(Thread.java:745)
>
> Driver stacktrace:
>         at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
>         at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
>         at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
>         at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>         at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
>         at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
>         at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
>         at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
>         at scala.Option.foreach(Option.scala:257)
>         at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
>         at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
>         at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
>         at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
>         at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>         at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
>         at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
>         at org.apache.spark.SparkContext.runJob(SparkContext.scala:1934)
>         at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1046)
>         at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>         at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
>         at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
>         at org.apache.spark.rdd.RDD.fold(RDD.scala:1040)
>         at org.apache.spark.mllib.clustering.EMLDAOptimizer.computeGlobalTopicTotals(LDAOptimizer.scala:226)
>         at org.apache.spark.mllib.clustering.EMLDAOptimizer.next(LDAOptimizer.scala:213)
>         at org.apache.spark.mllib.clustering.EMLDAOptimizer.next(LDAOptimizer.scala:79)
>         at org.apache.spark.mllib.clustering.LDA.run(LDA.scala:299)
>         at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainLDAModel(PythonMLLibAPI.scala:552)
>         at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>         at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>         at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>         at java.lang.reflect.Method.invoke(Method.java:498)
>         at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
>         at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
>         at py4j.Gateway.invoke(Gateway.java:280)
>         at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
>         at py4j.commands.CallCommand.execute(CallCommand.java:79)
>         at py4j.GatewayConnection.run(GatewayConnection.java:211)
>         at java.lang.Thread.run(Thread.java:745)
> Caused by: java.lang.ClassCastException: scala.Tuple2 cannot be cast to org.apache.spark.graphx.Edge
>         at org.apache.spark.graphx.EdgeRDD$$anonfun$1$$anonfun$apply$1.apply(EdgeRDD.scala:107)
>         at scala.collection.Iterator$class.foreach(Iterator.scala:893)
>         at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
>         at org.apache.spark.graphx.EdgeRDD$$anonfun$1.apply(EdgeRDD.scala:107)
>         at org.apache.spark.graphx.EdgeRDD$$anonfun$1.apply(EdgeRDD.scala:105)
>         at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:801)
>         at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:801)
>         at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>         at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:332)
>         at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:330)
>         at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:919)
>         at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:910)
>         at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:866)
>         at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:910)
>         at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:668)
>         at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
>         at org.apache.spark.graphx.EdgeRDD.compute(EdgeRDD.scala:50)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>         at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>         at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>         at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
>         at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
>         at org.apache.spark.scheduler.Task.run(Task.scala:85)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>         ... 1 more
>
>
> --
> View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/example-LDA-code-ClassCastException-tp28009.html
> Sent from the Apache Spark User List mailing list archive at Nabble.com.
>
> ---------------------------------------------------------------------
> To unsubscribe e-mail: user-unsubscr...@spark.apache.org
>

--
Asher Krim
Senior Software Engineer