I reworked my app using your idea of throwing the data in a map. It looks
like it should work but I'm getting some strange errors and my job gets
terminated. I get a
"WARN TaskSchedulerImpl: Initial job has not accepted any resources; check
your cluster UI to ensure that workers are registered and have sufficient
memory"
and before that in the Spark output I receive a
"Exception in thread "main" org.apache.spark.SparkException: Job aborted due
to stage failure: Task 2.0:0 failed 1 times, most recent failure: Exception
failure in TID 3 on host localhost: org.apache.spark.SparkException: Job
aborted due to stage failure: Task 0.0:0 failed 4 times, most recent
failure: TID 7 on host cloudera01.local.company.com failed for unknown
reason
Driver stacktrace:
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033)
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017)
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015)
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015)
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
scala.Option.foreach(Option.scala:236)
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:633)
org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1207)
akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
akka.actor.ActorCell.invoke(ActorCell.scala:456)
akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
akka.dispatch.Mailbox.run(Mailbox.scala:219)
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Driver stacktrace:
at
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
at scala.Option.foreach(Option.scala:236)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:633)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1207)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at
scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)"
I am using CDH 5.1.2 and went and set my worker_max_heapsize to its default
val of 512MB and also the executor_total_max_heapsize to default value of 8
GB after first experiencing this failure after reading it helped cure this
problem for someone.
The code is below
object App {
def main(args: Array[String]) {
val ssc = new StreamingContext("local[2]", "Data", Seconds(20))
ssc.checkpoint("checkpoint")
val eventMap = scala.collection.immutable.Map("uShip.Events" -> 1)
val pipe = KafkaUtils.createStream(ssc,
"dockerrepo,dockerrepo,dockerrepo", "Cons1", eventMap).map(_._2)
val eventStream = pipe.map(data => {
parse(data)
}).map(json => {
implicit val formats = DefaultFormats
val eventName = (json \ "event").extractOpt[String]
Event(eventName.getOrElse("*** NO EVENT NAME ***"), json)
})
eventStream.foreachRDD(rdd => {
var eventMap: Map[String,
scala.collection.mutable.MutableList[org.json4s.JValue]] = Map()
val eventArray = rdd.toArray
eventArray.foreach({ event =>
if (eventMap.contains(event.EventName)) {
var eventList = eventMap.getOrElse(event.EventName, new
scala.collection.mutable.MutableList[org.json4s.JValue])
eventList += event.Payload
} else {
var eventList = new
scala.collection.mutable.MutableList[org.json4s.JValue] += event.Payload
eventMap += (event.EventName -> eventList)
}
})
rdd.foreachPartition(rdd => {
val conf = new
SparkConf().setMaster("spark://cloudera01.local.company.com:7077")
val sc = new SparkContext(conf)
eventMap.foreach(event => {
def uuid = java.util.UUID.randomUUID.toString
val eventRDD = sc.makeRDD(event._2)
eventRDD.saveAsTextFile("hdfs://cloudera01.local.company.com:8020/user/hdfs/"
+ event._1 + "/rdd=" + eventRDD.id + "_ID_" + uuid)
})
})
})
ssc.start()
ssc.awaitTermination()
}
}
--
View this message in context:
http://apache-spark-user-list.1001560.n3.nabble.com/Using-data-in-RDD-to-specify-HDFS-directory-to-write-to-tp18789p18974.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]