I am testing again on a 64-node cluster (the JobManager is now running fine after I reduced some operators' parallelism and fixed the string conversion performance).
I am seeing TaskManagers drop like flies every other job or so. I am not seeing any output in the .out log files corresponding to the crashed TaskManagers. Below is the stack trace from a java.hprof heap dump. How should I be debugging this? Thanks, Greg Threads at the heap dump: Unknown thread "Memory Logger" daemon prio=1 tid=119 TIMED_WAITING at java.lang.Thread.<init>(Thread.java:507) at org.apache.flink.runtime.taskmanager.MemoryLogger.<init>(MemoryLogger.java:67) at org.apache.flink.runtime.taskmanager.TaskManager$.runTaskManager(TaskManager.scala:1494) at org.apache.flink.runtime.taskmanager.TaskManager$.selectNetworkInterfaceAndRunTaskManager(TaskManager.scala:1330) "Flink Netty Server (59693) Thread 0" daemon prio=5 tid=193 RUNNABLE at java.lang.Thread.<init>(Thread.java:674) at java.util.concurrent.Executors$DefaultThreadFactory.newThread(Executors.java:613) at org.apache.flink.shaded.com.google.common.util.concurrent.ThreadFactoryBuilder$1.newThread(ThreadFactoryBuilder.java:162) at io.netty.util.concurrent.SingleThreadEventExecutor.<init>(SingleThreadEventExecutor.java:106) "flink-akka.remote.default-remote-dispatcher-6" daemon prio=5 tid=30 TIMED_WAITING at java.lang.Thread.<init>(Thread.java:507) at scala.concurrent.forkjoin.ForkJoinWorkerThread.<init>(ForkJoinWorkerThread.java:48) at akka.dispatch.MonitorableThreadFactory$AkkaForkJoinWorkerThread.<init>(ThreadPoolBuilder.scala:164) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:187) "flink-akka.actor.default-dispatcher-4" daemon prio=5 tid=28 WAITING at java.lang.Thread.<init>(Thread.java:507) at scala.concurrent.forkjoin.ForkJoinWorkerThread.<init>(ForkJoinWorkerThread.java:48) at akka.dispatch.MonitorableThreadFactory$AkkaForkJoinWorkerThread.<init>(ThreadPoolBuilder.scala:164) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:187) "flink-akka.remote.default-remote-dispatcher-5" daemon prio=5 tid=29 WAITING at 
java.lang.Thread.<init>(Thread.java:507) at scala.concurrent.forkjoin.ForkJoinWorkerThread.<init>(ForkJoinWorkerThread.java:48) at akka.dispatch.MonitorableThreadFactory$AkkaForkJoinWorkerThread.<init>(ThreadPoolBuilder.scala:164) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:187) "flink-akka.actor.default-dispatcher-2" daemon prio=5 tid=26 WAITING at java.lang.Thread.<init>(Thread.java:507) at scala.concurrent.forkjoin.ForkJoinWorkerThread.<init>(ForkJoinWorkerThread.java:48) at akka.dispatch.MonitorableThreadFactory$AkkaForkJoinWorkerThread.<init>(ThreadPoolBuilder.scala:164) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:187) "SIGTERM handler" daemon prio=9 tid=268 RUNNABLE at java.lang.Thread.<init>(Thread.java:547) at sun.misc.Signal.dispatch(Signal.java:216) "HPROF gc_finish watcher" daemon prio=10 tid=5 RUNNABLE "Reference Handler" daemon prio=10 tid=2 WAITING "main" prio=5 tid=1 WAITING "Signal Dispatcher" daemon prio=9 tid=4 RUNNABLE "Finalizer" daemon prio=8 tid=3 WAITING "flink-akka.actor.default-dispatcher-3" daemon prio=5 tid=27 TIMED_WAITING at java.lang.Thread.<init>(Thread.java:507) at scala.concurrent.forkjoin.ForkJoinWorkerThread.<init>(ForkJoinWorkerThread.java:48) at akka.dispatch.MonitorableThreadFactory$AkkaForkJoinWorkerThread.<init>(ThreadPoolBuilder.scala:164) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:187) "New I/O worker #1" daemon prio=5 tid=31 RUNNABLE at java.lang.Thread.<init>(Thread.java:547) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:193) at java.util.concurrent.ThreadPoolExecutor$Worker.<init>(ThreadPoolExecutor.java:612) at java.util.concurrent.ThreadPoolExecutor.addWorker(ThreadPoolExecutor.java:925) "flink-scheduler-1" daemon prio=5 tid=25 TIMED_WAITING at java.lang.Thread.<init>(Thread.java:547) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:193) at 
akka.actor.LightArrayRevolverScheduler.<init>(Scheduler.scala:337) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(NativeConstructorAccessorImpl.java) "New I/O worker #2" daemon prio=5 tid=32 RUNNABLE at java.lang.Thread.<init>(Thread.java:547) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:193) at java.util.concurrent.ThreadPoolExecutor$Worker.<init>(ThreadPoolExecutor.java:612) at java.util.concurrent.ThreadPoolExecutor.addWorker(ThreadPoolExecutor.java:925) "Hashed wheel timer #1" daemon prio=5 tid=33 TIMED_WAITING at java.lang.Thread.<init>(Thread.java:547) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:193) at org.jboss.netty.util.HashedWheelTimer.<init>(HashedWheelTimer.java:226) Local Variable: java.util.ArrayList#502 at org.jboss.netty.util.HashedWheelTimer.<init>(HashedWheelTimer.java:177) Local Variable: java.lang.String#15234 "New I/O boss #3" daemon prio=5 tid=34 RUNNABLE at java.lang.Thread.<init>(Thread.java:547) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:193) at java.util.concurrent.ThreadPoolExecutor$Worker.<init>(ThreadPoolExecutor.java:612) at java.util.concurrent.ThreadPoolExecutor.addWorker(ThreadPoolExecutor.java:925) "Timer-0" daemon prio=5 tid=267 TIMED_WAITING at java.lang.Thread.<init>(Thread.java:444) at java.util.TimerThread.<init>(Timer.java:499) at java.util.Timer.<init>(Timer.java:101) at java.util.Timer.<init>(Timer.java:146) "New I/O worker #4" daemon prio=5 tid=35 RUNNABLE at java.lang.Thread.<init>(Thread.java:547) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:193) at java.util.concurrent.ThreadPoolExecutor$Worker.<init>(ThreadPoolExecutor.java:612) at java.util.concurrent.ThreadPoolExecutor.addWorker(ThreadPoolExecutor.java:925) "New I/O worker #5" daemon prio=5 tid=36 RUNNABLE at java.lang.Thread.<init>(Thread.java:547) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:193) 
at java.util.concurrent.ThreadPoolExecutor$Worker.<init>(ThreadPoolExecutor.java:612) at java.util.concurrent.ThreadPoolExecutor.addWorker(ThreadPoolExecutor.java:925) "New I/O server boss #6" daemon prio=5 tid=37 RUNNABLE at java.lang.Thread.<init>(Thread.java:547) at akka.dispatch.MonitorableThreadFactory.newThread(ThreadPoolBuilder.scala:193) at java.util.concurrent.ThreadPoolExecutor$Worker.<init>(ThreadPoolExecutor.java:612) at java.util.concurrent.ThreadPoolExecutor.addWorker(ThreadPoolExecutor.java:925)