Is that JSON object written on a single line in the file? jsonFile requires that every line of the input be a complete JSON object (or an array of JSON objects). If reformatting the file isn't an option, see the sketch below your quoted message.
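For reference, a minimal sketch of the layout jsonFile can handle, assuming a hypothetical /tmp/records.json with one complete JSON object per line (just an illustration, not your schema file):

    // /tmp/records.json would contain, e.g.:
    //   {"name":"f1","type":"string"}
    //   {"name":"f2","type":"string"}
    val sqlsc = new org.apache.spark.sql.SQLContext(sc)
    // jsonFile infers the schema from the per-line objects; on Spark 1.2 / CDH5.3 this is a SchemaRDD
    val records = sqlsc.jsonFile("/tmp/records.json")
    records.printSchema()
    records.registerTempTable("records")
    sqlsc.sql("SELECT name FROM records").collect().foreach(println)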
On Mon, Mar 2, 2015 at 11:28 AM, kpeng1 <kpe...@gmail.com> wrote:

> Hi All,
>
> I am currently having issues reading in a json file using spark sql's api.
> Here is what the json file looks like:
>
> {
>   "namespace": "spacey",
>   "name": "namer",
>   "type": "record",
>   "fields": [
>     {"name":"f1","type":["null","string"]},
>     {"name":"f2","type":["null","string"]},
>     {"name":"f3","type":["null","string"]},
>     {"name":"f4","type":["null","string"]},
>     {"name":"f5","type":["null","string"]},
>     {"name":"f6","type":["null","string"]},
>     {"name":"f7","type":["null","string"]},
>     {"name":"f8","type":["null","string"]},
>     {"name":"f9","type":["null","string"]},
>     {"name":"f10","type":["null","string"]},
>     {"name":"f11","type":["null","string"]},
>     {"name":"f12","type":["null","string"]},
>     {"name":"f13","type":["null","string"]},
>     {"name":"f14","type":["null","string"]},
>     {"name":"f15","type":["null","string"]}
>   ]
> }
>
> This is what I am doing to read in the json file (using spark sql in the
> spark shell on CDH5.3):
>
> val sqlsc = new org.apache.spark.sql.SQLContext(sc)
> val j = sqlsc.jsonFile("/tmp/try.avsc")
>
> This is what I am getting as an error:
>
> 15/03/02 11:23:45 WARN TaskSetManager: Lost task 0.0 in stage 3.0 (TID 12, 10.0.2.15): scala.MatchError: namespace (of class java.lang.String)
>     at org.apache.spark.sql.json.JsonRDD$$anonfun$parseJson$1$$anonfun$apply$2.apply(JsonRDD.scala:305)
>     at org.apache.spark.sql.json.JsonRDD$$anonfun$parseJson$1$$anonfun$apply$2.apply(JsonRDD.scala:303)
>     at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
>     at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
>     at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>     at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>     at scala.collection.TraversableOnce$class.reduceLeft(TraversableOnce.scala:172)
>     at scala.collection.AbstractIterator.reduceLeft(Iterator.scala:1157)
>     at org.apache.spark.rdd.RDD$$anonfun$18.apply(RDD.scala:853)
>     at org.apache.spark.rdd.RDD$$anonfun$18.apply(RDD.scala:851)
>     at org.apache.spark.SparkContext$$anonfun$29.apply(SparkContext.scala:1350)
>     at org.apache.spark.SparkContext$$anonfun$29.apply(SparkContext.scala:1350)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>     at org.apache.spark.scheduler.Task.run(Task.scala:56)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>     at java.lang.Thread.run(Thread.java:745)
>
> 15/03/02 11:23:45 INFO TaskSetManager: Starting task 0.1 in stage 3.0 (TID 14, 10.0.2.15, ANY, 1308 bytes)
> 15/03/02 11:23:45 INFO TaskSetManager: Finished task 1.0 in stage 3.0 (TID 13) in 128 ms on 10.0.2.15 (1/2)
> 15/03/02 11:23:45 INFO TaskSetManager: Lost task 0.1 in stage 3.0 (TID 14) on executor 10.0.2.15: scala.MatchError (namespace (of class java.lang.String)) [duplicate 1]
> 15/03/02 11:23:45 INFO TaskSetManager: Starting task 0.2 in stage 3.0 (TID 15, 10.0.2.15, ANY, 1308 bytes)
> 15/03/02 11:23:45 INFO TaskSetManager: Lost task 0.2 in stage 3.0 (TID 15) on executor 10.0.2.15: scala.MatchError (namespace (of class java.lang.String)) [duplicate 2]
> 15/03/02 11:23:45 INFO TaskSetManager: Starting task 0.3 in stage 3.0 (TID 16, 10.0.2.15, ANY, 1308 bytes)
> 15/03/02 11:23:45 INFO TaskSetManager: Lost task 0.3 in stage 3.0 (TID 16) on executor 10.0.2.15: scala.MatchError (namespace (of class java.lang.String)) [duplicate 3]
> 15/03/02 11:23:45 ERROR TaskSetManager: Task 0 in stage 3.0 failed 4 times; aborting job
> 15/03/02 11:23:45 INFO TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool
> 15/03/02 11:23:45 INFO TaskSchedulerImpl: Cancelling stage 3
> 15/03/02 11:23:45 INFO DAGScheduler: Job 3 failed: reduce at JsonRDD.scala:57, took 0.210707 s
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 4 times, most recent failure: Lost task 0.3 in stage 3.0 (TID 16, 10.0.2.15): scala.MatchError: namespace (of class java.lang.String)
>     at org.apache.spark.sql.json.JsonRDD$$anonfun$parseJson$1$$anonfun$apply$2.apply(JsonRDD.scala:305)
>     at org.apache.spark.sql.json.JsonRDD$$anonfun$parseJson$1$$anonfun$apply$2.apply(JsonRDD.scala:303)
>     at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
>     at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
>     at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>     at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>     at scala.collection.TraversableOnce$class.reduceLeft(TraversableOnce.scala:172)
>     at scala.collection.AbstractIterator.reduceLeft(Iterator.scala:1157)
>     at org.apache.spark.rdd.RDD$$anonfun$18.apply(RDD.scala:853)
>     at org.apache.spark.rdd.RDD$$anonfun$18.apply(RDD.scala:851)
>     at org.apache.spark.SparkContext$$anonfun$29.apply(SparkContext.scala:1350)
>     at org.apache.spark.SparkContext$$anonfun$29.apply(SparkContext.scala:1350)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>     at org.apache.spark.scheduler.Task.run(Task.scala:56)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>     at java.lang.Thread.run(Thread.java:745)
>
> Driver stacktrace:
>     at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1214)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1203)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1202)
>     at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>     at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>     at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1202)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
>     at scala.Option.foreach(Option.scala:236)
>     at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:696)
>     at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1420)
>     at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
>     at akka.actor.ActorCell.invoke(ActorCell.scala:456)
>     at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
>     at akka.dispatch.Mailbox.run(Mailbox.scala:219)
>     at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
>     at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>     at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>     at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>     at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
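If the file has to stay pretty-printed across multiple lines, one possible workaround is to read each file as a single string and hand it to jsonRDD. This is only a sketch, assuming Spark 1.2's SQLContext.jsonRDD and SparkContext.wholeTextFiles; the variable names are mine:

    val sqlsc = new org.apache.spark.sql.SQLContext(sc)
    // wholeTextFiles yields (path, contents) pairs, i.e. one record per file,
    // so the multi-line JSON document stays together as one string
    val oneRecordPerFile = sc.wholeTextFiles("/tmp/try.avsc").map { case (_, contents) =>
      // collapse newlines so each record is a single-line JSON string
      contents.replaceAll("\\s*\\n\\s*", " ")
    }
    val schemaRdd = sqlsc.jsonRDD(oneRecordPerFile)
    schemaRdd.printSchema()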