Howdy,
I'm a relative novice at Spark/Scala and I'm puzzled by some behavior that
I'm seeing in two of my local Spark/Scala environments (Scala for Jupyter and
Scala IDE) but not in the third (Spark Shell). The code below throws the
exception shown in the stack trace below in the first two environments but
executes successfully in the third. I'm not sure how to go about
troubleshooting the first two environments, so any assistance is greatly
appreciated.
Code:
// Load the log file as an RDD of text lines.
val logFile = "s3n://file"
val logData = sc.textFile(logFile)
// The first line of the file is treated as the header row.
val header = logData.first
// Drop every line containing the header text, strip single and double quotes,
// remove the final character of what remains, then draw a fixed-seed sample
// of 100 lines without replacement.
// NOTE(review): the filter closure reads `header`, which is a member of the
// enclosing wrapper object in notebook/IDE environments; serializing the
// closure presumably pulls that wrapper (and everything it references) along.
// Confirm against the serialization stack of the reported exception.
val sample = logData
  .filter(!_.contains(header))
  .map { line =>
    // Trim relative to the cleaned string. Measuring the original `line`
    // overruns the (shorter) cleaned string once two or more quotes have been
    // removed, and fails on an empty line; dropRight handles both safely.
    line.replaceAll("['\"]", "").dropRight(1)
  }
  .takeSample(withReplacement = false, num = 100, seed = 12L)
Stack Trace:
org.apache.spark.SparkException: Task not serializable
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:315)
org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:305)
org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:132)
org.apache.spark.SparkContext.clean(SparkContext.scala:1893)
org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:311)
org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:310)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
org.apache.spark.rdd.RDD.withScope(RDD.scala:286)
org.apache.spark.rdd.RDD.filter(RDD.scala:310)
cmd6$$user$$anonfun$3.apply(Main.scala:134)
cmd6$$user$$anonfun$3.apply(Main.scala:133)
java.io.NotSerializableException: org.apache.spark.SparkConf
Serialization stack:
- object not serializable (class: org.apache.spark.SparkConf, value:
org.apache.spark.SparkConf@309ed441)
- field (class: cmd2$$user, name: conf, type: class
org.apache.spark.SparkConf)
- object (class cmd2$$user, cmd2$$user@75a88665)
- field (class: cmd6, name: $ref$cmd2, type: class cmd2$$user)
- object (class cmd6, cmd6@5e9e8f0b)
- field (class: cmd6$$user, name: $outer, type: class cmd6)
- object (class cmd6$$user, cmd6$$user@692f81c)
- field (class: cmd6$$user$$anonfun$3, name: $outer, type: class
cmd6$$user)
- object (class cmd6$$user$$anonfun$3, <function0>)
- field (class: cmd6$$user$$anonfun$3$$anonfun$apply$1, name: $outer,
type: class cmd6$$user$$anonfun$3)
- object (class cmd6$$user$$anonfun$3$$anonfun$apply$1, <function1>)
org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:81)
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:312)
org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:305)
org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:132)
org.apache.spark.SparkContext.clean(SparkContext.scala:1893)
org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:311)
org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:310)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
org.apache.spark.rdd.RDD.withScope(RDD.scala:286)
org.apache.spark.rdd.RDD.filter(RDD.scala:310)
cmd6$$user$$anonfun$3.apply(Main.scala:134)
cmd6$$user$$anonfun$3.apply(Main.scala:133)
Thanks,
Balaji