Hi all, I'm using Spark 1.6.2 and have run into a problem, so I would kindly ask for your help. Sometimes, when the DAGScheduler tries to serialise the <rdd, func> pair, an OutOfMemoryError is thrown inside the closureSerializer.serialize() call (you can see the stack trace below). The OOM itself isn't the problem; the problem is that Spark hangs after it happens.
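For reference, the try-catch in question looks roughly like this in the 1.6 sources (paraphrased and trimmed; DAGScheduler.scala around line 1016, which is the first frame in the trace below):

    var taskBinary: Broadcast[Array[Byte]] = null
    try {
      // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep);
      // for ResultTask, serialize and broadcast (rdd, func).
      val taskBinaryBytes: Array[Byte] = stage match {
        case stage: ShuffleMapStage =>
          closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
        case stage: ResultStage =>
          closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
      }
      taskBinary = sc.broadcast(taskBinaryBytes)
    } catch {
      // A serialization failure aborts the stage...
      case e: NotSerializableException =>
        abortStage(stage, "Task not serializable: " + e.toString, Some(e))
        runningStages -= stage
        return
      // ...but only non-fatal throwables are matched here, and
      // OutOfMemoryError is fatal, so it is not caught.
      case NonFatal(e) =>
        abortStage(stage, s"Task serialization failed: $e", Some(e))
        runningStages -= stage
        return
    }

Since NonFatal(e) doesn't match an OutOfMemoryError, the error propagates out of onReceive and, as far as I can tell, kills the DAGScheduler event-loop thread without the stage ever being aborted, which would explain the hang.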
I've fixed the problem by adding OutOfMemoryError handling to that try-catch statement inside the submitMissingTasks() function, and now when this happens Spark correctly finishes its work. But I noticed that only non-fatal errors are handled there (the stage is aborted when one occurs), and there are a lot of places in the codebase where NonFatal(e) is matched, so it looks like fatal errors are deliberately left uncaught. Since I don't clearly understand the reason for that, I'm not sure my fix is correct. Could you please have a look and point me to a better solution for the issue?

The fix: abort-task-on-oom-in-dag-scheduler.patch <http://apache-spark-developers-list.1001551.n3.nabble.com/file/n18639/abort-task-on-oom-in-dag-scheduler.patch>
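Simplified, the idea is to add one more case to the catch block above (this is only a sketch; the attached patch has the actual diff):

      // Sketch of the extra case: abort the stage on OOM during
      // serialization instead of letting the error escape the event loop.
      case oom: OutOfMemoryError =>
        abortStage(stage, s"Task serialization failed with OOM: $oom", Some(oom))
        runningStages -= stage
        return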
The stack trace:

    at org.apache.spark.scheduler.DAGScheduler.submitMissingTasks(DAGScheduler.scala:1016)
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:921)
    at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:861)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1611)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1603)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1592)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:920)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:918)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:918)
    at com.massivedatascience.util.SparkHelper$class.sync(SparkHelper.scala:39)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans.sync(ColumnTrackingKMeans.scala:255)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans$$anonfun$11.apply(ColumnTrackingKMeans.scala:457)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans$$anonfun$11.apply(ColumnTrackingKMeans.scala:456)
    at com.massivedatascience.util.SparkHelper$class.withBroadcast(SparkHelper.scala:83)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans.withBroadcast(ColumnTrackingKMeans.scala:255)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans.com$massivedatascience$clusterer$ColumnTrackingKMeans$$lloyds$1(ColumnTrackingKMeans.scala:456)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans$$anonfun$cluster$3$$anonfun$apply$5.apply(ColumnTrackingKMeans.scala:485)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans$$anonfun$cluster$3$$anonfun$apply$5.apply(ColumnTrackingKMeans.scala:480)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans$$anonfun$cluster$3.apply(ColumnTrackingKMeans.scala:480)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans$$anonfun$cluster$3.apply(ColumnTrackingKMeans.scala:479)
    at com.massivedatascience.util.SparkHelper$class.withCached(SparkHelper.scala:71)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans.withCached(ColumnTrackingKMeans.scala:255)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans.cluster(ColumnTrackingKMeans.scala:479)
    at com.massivedatascience.clusterer.MultiKMeansClusterer$class.best(MultiKMeansClusterer.scala:37)
    at com.massivedatascience.clusterer.ColumnTrackingKMeans.best(ColumnTrackingKMeans.scala:255)
    at com.massivedatascience.clusterer.KMeans$.simpleTrain(KMeans.scala:168)
    at com.massivedatascience.clusterer.KMeans$.iterativelyTrain(KMeans.scala:249)
    at com.massivedatascience.clusterer.KMeans$$anonfun$trainWeighted$1.apply(KMeans.scala:111)
    at com.massivedatascience.clusterer.KMeans$$anonfun$trainWeighted$1.apply(KMeans.scala:110)
    at com.massivedatascience.util.SparkHelper$class.withCached(SparkHelper.scala:55)
    at com.massivedatascience.clusterer.KMeans$.withCached(KMeans.scala:32)
    at com.massivedatascience.clusterer.KMeans$.trainWeighted(KMeans.scala:110)
    at com.massivedatascience.clusterer.KMeans$$anonfun$train$2.apply(KMeans.scala:81)
    at com.massivedatascience.clusterer.KMeans$$anonfun$train$2.apply(KMeans.scala:77)
    at com.massivedatascience.util.SparkHelper$class.withCached(SparkHelper.scala:71)
    at com.massivedatascience.clusterer.KMeans$.withCached(KMeans.scala:32)
    at com.massivedatascience.clusterer.KMeans$.train(KMeans.scala:77)
    at com.badoo.antispam.clustering.InvokeKMeans$.main(InvokeKmeans.scala:175)
    at com.badoo.antispam.clustering.InvokeKMeans.main(InvokeKmeans.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:558)
Caused by: java.lang.OutOfMemoryError: Java heap space
    at java.util.Arrays.copyOf(Arrays.java:3332)
    at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:137)
    at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:121)
    at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:421)
    at java.lang.StringBuilder.append(StringBuilder.java:136)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1421)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
    at scala.collection.immutable.$colon$colon.writeObject(List.scala:379)
    at sun.reflect.GeneratedMethodAccessor93.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:988)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1496)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)