Hi, I'm launching a Spark cluster with the spark-ec2 script and playing around in spark-shell. I'm running the same line of code over and over, and I get a different result each time, sometimes an exception. Toward the end of the transcript below, after I cache the source RDD (data), it returns the correct result several times in a row before throwing an exception again. How can I get correct, repeatable behavior out of these operations on these RDDs? Here is the spark-shell transcript:
    scala> val targets = data.map(_.REGEX).groupBy(identity).map(Function.tupled(_->_.size)).sortBy(_._2,false)
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[116] at sortBy at <console>:36

    scala> targets.first
    res26: (String, Int) = (\bguns?\b,1253)

    scala> val targets = data map {_.REGEX} groupBy{identity} map { Function.tupled(_->_.size)} sortBy(_._2,false)
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[125] at sortBy at <console>:36

    scala> targets.first
    res27: (String, Int) = (nika,7)

    scala> val targets = data.map(_.REGEX).groupBy(identity).map(Function.tupled(_->_.size)).sortBy(_._2,false)
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[134] at sortBy at <console>:36

    scala> targets.first
    res28: (String, Int) = (\bcalientes?\b,6)

    scala> targets.sortBy(_._2,false).first
    java.lang.UnsupportedOperationException: empty collection

    scala> val targets = data.map(_.REGEX).groupBy(identity).map(Function.tupled(_->_.size)).sortBy(_._2,false).cache
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[283] at sortBy at <console>:36

    scala> targets.first
    res46: (String, Int) = (\bhurting\ yous?\b,8)

    scala> val targets = data.map(_.REGEX).groupBy(identity).map(Function.tupled(_->_.size)).sortBy(_._2,false).cache
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[292] at sortBy at <console>:36

    scala> targets.first
    java.lang.UnsupportedOperationException: empty collection

    scala> val targets = data.cache.map(_.REGEX).groupBy(identity).map(Function.tupled(_->_.size)).sortBy(_._2,false)
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[301] at sortBy at <console>:36

    scala> targets.first
    res48: (String, Int) = (\bguns?\b,1253)

    scala> val targets = data.cache.map(_.REGEX).groupBy(identity).map(Function.tupled(_->_.size)).sortBy(_._2,false)
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[310] at sortBy at <console>:36

    scala> targets.first
    res49: (String, Int) = (\bguns?\b,1253)

    scala> val targets = data.cache.map(_.REGEX).groupBy(identity).map(Function.tupled(_->_.size)).sortBy(_._2,false)
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[319] at sortBy at <console>:36

    scala> targets.first
    res50: (String, Int) = (\bguns?\b,1253)

    scala> val targets = data.cache.map(_.REGEX).groupBy(identity).map(Function.tupled(_->_.size)).sortBy(_._2,false)
    targets: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[328] at sortBy at <console>:36

    scala> targets.first
    java.lang.UnsupportedOperationException: empty collection
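For reference, here is a self-contained sketch of what the chain is meant to compute: a descending frequency count of the REGEX field. The Record case class and the sample values are hypothetical stand-ins for my actual data, which lives on the cluster; with this toy input the chain runs deterministically in a local spark-shell.

    import org.apache.spark.rdd.RDD

    // Stand-in for my real record type; REGEX is a String field.
    case class Record(REGEX: String)

    // Stand-in data; the real `data` RDD is loaded on the cluster.
    val data: RDD[Record] = sc.parallelize(Seq(
      Record("\\bguns?\\b"), Record("\\bguns?\\b"), Record("nika")
    ))

    // Count occurrences of each REGEX value, most frequent first.
    val targets = data
      .map(_.REGEX)                        // extract the pattern string
      .groupBy(identity)                   // group identical patterns together
      .map(Function.tupled(_ -> _.size))   // (pattern, number of occurrences)
      .sortBy(_._2, ascending = false)     // highest count first

    targets.first  // here this should be (\bguns?\b,2) every time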