sounds like with ORC transactional table this happens When I create that table as ORC but non transactional it works!
Dr Mich Talebzadeh LinkedIn * https://www.linkedin.com/profile/view?id=AAEAAAAWh2gBxianrbJd6zP6AcPCCdOABUrV8Pw <https://www.linkedin.com/profile/view?id=AAEAAAAWh2gBxianrbJd6zP6AcPCCdOABUrV8Pw>* http://talebzadehmich.wordpress.com On 21 March 2016 at 17:53, Eugene Koifman <ekoif...@hortonworks.com> wrote: > The system thinks t2 is an Acid table but the files on disk don’t follow > the convention acid system would expect. > Perhaps Xuefu Zhang would know more on Spark/Aicd integration. > > From: Mich Talebzadeh <mich.talebza...@gmail.com> > Reply-To: "user@hive.apache.org" <user@hive.apache.org> > Date: Monday, March 21, 2016 at 9:39 AM > To: "user @spark" <u...@spark.apache.org>, user <user@hive.apache.org> > Subject: Error selecting from a Hive ORC table in Spark-sql > > Hi, > > Do we know the cause of this error when selecting from an Hive ORC table > > spark-sql> > *select * from t2; *16/03/21 16:38:33 ERROR SparkSQLDriver: Failed in > [select * from t2] > java.lang.RuntimeException: serious problem > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1021) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSplits(OrcInputFormat.java:1048) > at > org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:207) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) > at scala.Option.getOrElse(Option.scala:120) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) > at scala.Option.getOrElse(Option.scala:120) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) > at scala.Option.getOrElse(Option.scala:120) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) > at scala.Option.getOrElse(Option.scala:120) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:1921) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:909) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) > at org.apache.spark.rdd.RDD.collect(RDD.scala:908) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:177) > at > org.apache.spark.sql.hive.HiveContext$QueryExecution.stringResult(HiveContext.scala:587) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:63) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:308) > at > org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:226) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at > org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) > at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) > at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) > Caused by: java.util.concurrent.ExecutionException: > java.lang.NumberFormatException: For input string: "0000039_0000" > at > java.util.concurrent.FutureTask$Sync.innerGet(FutureTask.java:252) > at java.util.concurrent.FutureTask.get(FutureTask.java:111) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:998) > ... 43 more > Caused by: java.lang.NumberFormatException: For input string: > "0000039_0000" > at > java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) > at java.lang.Long.parseLong(Long.java:441) > at java.lang.Long.parseLong(Long.java:483) > at > org.apache.hadoop.hive.ql.io.AcidUtils.parseDelta(AcidUtils.java:310) > at > org.apache.hadoop.hive.ql.io.AcidUtils.getAcidState(AcidUtils.java:379) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator.call(OrcInputFormat.java:634) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator.call(OrcInputFormat.java:620) > at > java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334) > at java.util.concurrent.FutureTask.run(FutureTask.java:166) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:724) > java.lang.RuntimeException: serious problem > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1021) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSplits(OrcInputFormat.java:1048) > at > org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:207) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) > at scala.Option.getOrElse(Option.scala:120) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) > at scala.Option.getOrElse(Option.scala:120) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) > at scala.Option.getOrElse(Option.scala:120) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) > at scala.Option.getOrElse(Option.scala:120) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:1921) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:909) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) > at org.apache.spark.rdd.RDD.collect(RDD.scala:908) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:177) > at > org.apache.spark.sql.hive.HiveContext$QueryExecution.stringResult(HiveContext.scala:587) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:63) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:308) > at > org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:226) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at > org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) > at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) > at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) > Caused by: java.util.concurrent.ExecutionException: > java.lang.NumberFormatException: For input string: "0000039_0000" > at > java.util.concurrent.FutureTask$Sync.innerGet(FutureTask.java:252) > at java.util.concurrent.FutureTask.get(FutureTask.java:111) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:998) > ... 43 more > Caused by: java.lang.NumberFormatException: For input string: > "0000039_0000" > at > java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) > at java.lang.Long.parseLong(Long.java:441) > at java.lang.Long.parseLong(Long.java:483) > at > org.apache.hadoop.hive.ql.io.AcidUtils.parseDelta(AcidUtils.java:310) > at > org.apache.hadoop.hive.ql.io.AcidUtils.getAcidState(AcidUtils.java:379) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator.call(OrcInputFormat.java:634) > at > org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator.call(OrcInputFormat.java:620) > at > java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334) > at java.util.concurrent.FutureTask.run(FutureTask.java:166) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:724) > > > > Dr Mich Talebzadeh > > > > LinkedIn * > https://www.linkedin.com/profile/view?id=AAEAAAAWh2gBxianrbJd6zP6AcPCCdOABUrV8Pw > <https://www.linkedin.com/profile/view?id=AAEAAAAWh2gBxianrbJd6zP6AcPCCdOABUrV8Pw>* > > > > http://talebzadehmich.wordpress.com > > >