The issue is now resolved.

One of the CSV files had a malformed record at the end.
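
For anyone who hits the same error: the ArrayIndexOutOfBoundsException in the
trace below comes from the closure that splits each CSV line and indexes the
resulting array (the anonfun$2.apply(<console>:30) frames), so a truncated or
corrupt trailing record makes the index run past the end of the split fields.
A quick way to locate such records before building the case-class RDDs is to
count fields per line. This is only a sketch for the spark-shell session shown
below -- the file path is taken from the log, while the comma delimiter is an
assumption, since the original loading code isn't shown:

    // Run in the spark-shell; `sc` is the SparkContext the shell provides.
    val path = "hdfs://CDH-Master-1.cdhcluster/user/spark/Interval_f.csv"
    val expectedFields = 17  // interval_f has 17 columns

    // Flag every record whose field count does not match the schema, instead of
    // letting fields(n) throw ArrayIndexOutOfBoundsException inside the query.
    sc.textFile(path)
      .map(line => (line, line.split(",", -1).length))  // -1 keeps trailing empty fields
      .filter { case (_, n) => n != expectedFields }
      .collect()
      .foreach { case (line, n) =>
        println(s"Malformed record ($n fields, expected $expectedFields): $line")
      }

Dropping (or fixing) the flagged lines before the map that builds the case-class
rows is what resolved the join failures here.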

On Fri, Feb 27, 2015 at 4:24 PM, anamika gupta <anamika.guo...@gmail.com>
wrote:

> I have three tables with the following schemas:
>
> case class *date_d*(WID: Int, CALENDAR_DATE: java.sql.Timestamp,
> DATE_STRING: String, DAY_OF_WEEK: String, DAY_OF_MONTH: Int, DAY_OF_YEAR:
> Int, END_OF_MONTH_FLAG: String, YEARWEEK: Int, CALENDAR_MONTH: String,
> MONTH_NUM: Int, YEARMONTH: Int, QUARTER: Int, YEAR: Int)
>
>
>
> case class *interval_f*(ORG_ID: Int, CHANNEL_WID: Int, SDP_WID: Int,
> MEAS_WID: Int, DATE_WID: Int, TIME_WID: Int, VALIDATION_STATUS_CD: Int,
> VAL_FAIL_CD:Int, INTERVAL_FLAG_CD: Int, CHANGE_METHOD_WID:Int,
> SOURCE_LAST_UPD_TIME: java.sql.Timestamp, INTERVAL_END_TIME:
> java.sql.Timestamp, LOCKED: String, EXT_VERSION_TIME: java.sql.Timestamp,
> INTERVAL_VALUE: Double, INSERT_TIME: java.sql.Timestamp, LAST_UPD_TIME:
> java.sql.Timestamp)
>
>
>
> class *sdp_d*( WID :Option[Int], BATCH_ID :Option[Int], SRC_ID
> :Option[String], ORG_ID :Option[Int], CLASS_WID :Option[Int], DESC_TEXT
> :Option[String], PREMISE_WID :Option[Int], FEED_LOC :Option[String],
> GPS_LAT :Option[Double], GPS_LONG :Option[Double], PULSE_OUTPUT_BLOCK
> :Option[String], UDC_ID :Option[String], UNIVERSAL_ID :Option[String],
> IS_VIRTUAL_FLG :Option[String], SEAL_INFO :Option[String], ACCESS_INFO
> :Option[String], ALT_ACCESS_INFO :Option[String], LOC_INFO :Option[String],
> ALT_LOC_INFO :Option[String], TYPE :Option[String], SUB_TYPE
> :Option[String], TIMEZONE_ID :Option[Int], GIS_ID :Option[String],
> BILLED_UPTO_TIME :Option[java.sql.Timestamp], POWER_STATUS :Option[String],
> LOAD_STATUS :Option[String], BILLING_HOLD_STATUS :Option[String],
> INSERT_TIME :Option[java.sql.Timestamp], LAST_UPD_TIME
> :Option[java.sql.Timestamp]) extends Product{
>
>     @throws(classOf[IndexOutOfBoundsException])
>     override def productElement(n: Int) = n match
>     {
>         case 0 => WID; case 1 => BATCH_ID; case 2 => SRC_ID; case 3 =>
> ORG_ID; case 4 => CLASS_WID; case 5 => DESC_TEXT; case 6 => PREMISE_WID;
> case 7 => FEED_LOC; case 8 => GPS_LAT; case 9 => GPS_LONG; case 10 =>
> PULSE_OUTPUT_BLOCK; case 11 => UDC_ID; case 12 => UNIVERSAL_ID; case 13 =>
> IS_VIRTUAL_FLG; case 14 => SEAL_INFO; case 15 => ACCESS_INFO; case 16 =>
> ALT_ACCESS_INFO; case 17 => LOC_INFO; case 18 => ALT_LOC_INFO; case 19 =>
> TYPE; case 20 => SUB_TYPE; case 21 => TIMEZONE_ID; case 22 => GIS_ID; case
> 23 => BILLED_UPTO_TIME; case 24 => POWER_STATUS; case 25 => LOAD_STATUS;
> case 26 => BILLING_HOLD_STATUS; case 27 => INSERT_TIME; case 28 =>
> LAST_UPD_TIME; case _ => throw new IndexOutOfBoundsException(n.toString())
>     }
>
>     override def productArity: Int = 29; override def canEqual(that: Any):
> Boolean = that.isInstanceOf[sdp_d]
> }
>
>
>
> Non-join queries work fine:
>
> *val q1 = sqlContext.sql("""SELECT YEAR, DAY_OF_YEAR, MAX(WID), MIN(WID),
> COUNT(*) FROM date_d GROUP BY YEAR, DAY_OF_YEAR ORDER BY YEAR,
> DAY_OF_YEAR""")*
>
> res4: Array[org.apache.spark.sql.Row] =
> Array([2014,305,20141101,20141101,1], [2014,306,20141102,20141102,1],
> [2014,307,20141103,20141103,1], [2014,308,20141104,20141104,1],
> [2014,309,20141105,20141105,1], [2014,310,20141106,20141106,1],
> [2014,311,20141107,20141107,1], [2014,312,20141108,20141108,1],
> [2014,313,20141109,20141109,1], [2014,314,20141110,20141110,1],
> [2014,315,20141111,20141111,1], [2014,316,20141112,20141112,1],
> [2014,317,20141113,20141113,1], [2014,318,20141114,20141114,1],
> [2014,319,20141115,20141115,1], [2014,320,20141116,20141116,1],
> [2014,321,20141117,20141117,1], [2014,322,20141118,20141118,1],
> [2014,323,20141119,20141119,1], [2014,324,20141120,20141120,1],
> [2014,325,20141121,20141121,1], [2014,326,20141122,20141122,1],
> [2014,327,20141123,20141123,1], [2014,328,20141...
>
>
>
> But the join queries throw this error:
> *java.lang.ArrayIndexOutOfBoundsException*
>
> *scala> val q = sqlContext.sql("""select * from date_d dd join interval_f
> intf on intf.DATE_WID = dd.WID Where intf.DATE_WID >= 20141101 AND
> intf.DATE_WID <= 20141110""")*
>
> q: org.apache.spark.sql.SchemaRDD =
> SchemaRDD[38] at RDD at SchemaRDD.scala:103
> == Query Plan ==
> == Physical Plan ==
> Project
> [WID#0,CALENDAR_DATE#1,DATE_STRING#2,DAY_OF_WEEK#3,DAY_OF_MONTH#4,DAY_OF_YEAR#5,END_OF_MONTH_FLAG#6,YEARWEEK#7,CALENDAR_MONTH#8,MONTH_NUM#9,YEARMONTH#10,QUARTER#11,YEAR#12,ORG_ID#13,CHANNEL_WID#14,SDP_WID#15,MEAS_WID#16,DATE_WID#17,TIME_WID#18,VALIDATION_STATUS_CD#19,VAL_FAIL_CD#20,INTERVAL_FLAG_CD#21,CHANGE_METHOD_WID#22,SOURCE_LAST_UPD_TIME#23,INTERVAL_END_TIME#24,LOCKED#25,EXT_VERSION_TIME#26,INTERVAL_VALUE#27,INSERT_TIME#28,LAST_UPD_TIME#29]
>  ShuffledHashJoin [WID#0], [DATE_WID#17], BuildRight
>   Exchange (HashPartitioning [WID#0], 200)
>    InMemoryColumnarTableScan
> [WID#0,CALENDAR_DATE#1,DATE_STRING#2,DAY_OF_WEEK#3,DAY_OF_MONTH#4,DAY_OF_YEAR#5,END_OF_MONTH_FLA...
>
>
> *scala> q.take(5).foreach(println)*
>
> 15/02/27 15:50:26 INFO SparkContext: Starting job: runJob at
> basicOperators.scala:136
> 15/02/27 15:50:26 INFO DAGScheduler: Registering RDD 46 (mapPartitions at
> Exchange.scala:48)
> 15/02/27 15:50:26 INFO FileInputFormat: Total input paths to process : 1
> 15/02/27 15:50:26 INFO DAGScheduler: Registering RDD 42 (mapPartitions at
> Exchange.scala:48)
> 15/02/27 15:50:26 INFO DAGScheduler: Got job 2 (runJob at
> basicOperators.scala:136) with 1 output partitions (allowLocal=false)
> 15/02/27 15:50:26 INFO DAGScheduler: Final stage: Stage 5(runJob at
> basicOperators.scala:136)
> 15/02/27 15:50:26 INFO DAGScheduler: Parents of final stage: List(Stage 6,
> Stage 7)
> 15/02/27 15:50:26 INFO DAGScheduler: Missing parents: List(Stage 6, Stage
> 7)
> 15/02/27 15:50:26 INFO DAGScheduler: Submitting Stage 6
> (MapPartitionsRDD[46] at mapPartitions at Exchange.scala:48), which has no
> missing parents
> 15/02/27 15:50:26 INFO MemoryStore: ensureFreeSpace(48744) called with
> curMem=1042656, maxMem=278302556
> 15/02/27 15:50:26 INFO MemoryStore: Block broadcast_7 stored as values in
> memory (estimated size 47.6 KB, free 264.4 MB)
> 15/02/27 15:50:26 INFO DAGScheduler: Submitting 2 missing tasks from Stage
> 6 (MapPartitionsRDD[46] at mapPartitions at Exchange.scala:48)
> 15/02/27 15:50:26 INFO TaskSchedulerImpl: Adding task set 6.0 with 2 tasks
> 15/02/27 15:50:26 INFO TaskSetManager: Starting task 0.0 in stage 6.0 (TID
> 594, localhost, ANY, 1197 bytes)
> 15/02/27 15:50:26 INFO TaskSetManager: Starting task 1.0 in stage 6.0 (TID
> 595, localhost, ANY, 1197 bytes)
> 15/02/27 15:50:26 INFO Executor: Running task 0.0 in stage 6.0 (TID 594)
> 15/02/27 15:50:26 INFO Executor: Running task 1.0 in stage 6.0 (TID 595)
> 15/02/27 15:50:26 INFO DAGScheduler: Submitting Stage 7
> (MapPartitionsRDD[42] at mapPartitions at Exchange.scala:48), which has no
> missing parents
> 15/02/27 15:50:26 INFO MemoryStore: ensureFreeSpace(50184) called with
> curMem=1091400, maxMem=278302556
> 15/02/27 15:50:26 INFO MemoryStore: Block broadcast_8 stored as values in
> memory (estimated size 49.0 KB, free 264.3 MB)
> 15/02/27 15:50:26 INFO DAGScheduler: Submitting 2 missing tasks from Stage
> 7 (MapPartitionsRDD[42] at mapPartitions at Exchange.scala:48)
> 15/02/27 15:50:26 INFO TaskSchedulerImpl: Adding task set 7.0 with 2 tasks
> 15/02/27 15:50:26 INFO TaskSetManager: Starting task 0.0 in stage 7.0 (TID
> 596, localhost, ANY, 1201 bytes)
> 15/02/27 15:50:26 INFO TaskSetManager: Starting task 1.0 in stage 7.0 (TID
> 597, localhost, ANY, 1201 bytes)
> 15/02/27 15:50:26 INFO Executor: Running task 0.0 in stage 7.0 (TID 596)
> 15/02/27 15:50:26 INFO Executor: Running task 1.0 in stage 7.0 (TID 597)
> 15/02/27 15:50:26 INFO BlockManager: Found block rdd_19_0 locally
> 15/02/27 15:50:26 INFO BlockManager: Found block rdd_19_1 locally
> 15/02/27 15:50:26 INFO CacheManager: Partition rdd_21_0 not found,
> computing it
> 15/02/27 15:50:26 INFO CacheManager: Partition rdd_21_1 not found,
> computing it
> 15/02/27 15:50:26 INFO HadoopRDD: Input split:
> hdfs://CDH-Master-1.cdhcluster/user/spark/Interval_f.csv:0+279831
> 15/02/27 15:50:26 INFO HadoopRDD: Input split:
> hdfs://CDH-Master-1.cdhcluster/user/spark/Interval_f.csv:279831+279831
> 15/02/27 15:50:26 INFO Executor: Finished task 1.0 in stage 6.0 (TID 595).
> 2134 bytes result sent to driver
> 15/02/27 15:50:26 INFO Executor: Finished task 0.0 in stage 6.0 (TID 594).
> 2134 bytes result sent to driver
> 15/02/27 15:50:26 INFO TaskSetManager: Finished task 1.0 in stage 6.0 (TID
> 595) in 66 ms on localhost (1/2)
> 15/02/27 15:50:26 INFO TaskSetManager: Finished task 0.0 in stage 6.0 (TID
> 594) in 68 ms on localhost (2/2)
> 15/02/27 15:50:26 INFO TaskSchedulerImpl: Removed TaskSet 6.0, whose tasks
> have all completed, from pool
> 15/02/27 15:50:26 INFO DAGScheduler: Stage 6 (mapPartitions at
> Exchange.scala:48) finished in 0.068 s
> 15/02/27 15:50:26 INFO DAGScheduler: looking for newly runnable stages
> 15/02/27 15:50:26 INFO DAGScheduler: running: Set(Stage 7)
> 15/02/27 15:50:26 INFO DAGScheduler: waiting: Set(Stage 5)
> 15/02/27 15:50:26 INFO DAGScheduler: failed: Set()
> 15/02/27 15:50:26 INFO DAGScheduler: Missing parents for Stage 5:
> List(Stage 7)
> 15/02/27 15:50:26 INFO BlockManager: Removing broadcast 7
> 15/02/27 15:50:26 INFO BlockManager: Removing block broadcast_7
> 15/02/27 15:50:26 INFO MemoryStore: Block broadcast_7 of size 48744
> dropped from memory (free 277209716)
> 15/02/27 15:50:26 INFO ContextCleaner: Cleaned broadcast 7
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [ORG_ID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@409ea4d1,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [CHANNEL_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@6f56b67b,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [SDP_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@25e67e58,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [MEAS_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@3f70d6d8,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [DATE_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@9291f72,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [TIME_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@6acf7a10,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [VALIDATION_STATUS_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@5b56e738,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [VAL_FAIL_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@26663c61,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [INTERVAL_FLAG_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@20617f9,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [CHANGE_METHOD_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@5e0fdd78,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [SOURCE_LAST_UPD_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@63952186,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [INTERVAL_END_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@53cc177d,
> ratio: 1.0
> 15/02/27 15:50:26 INFO StringColumnBuilder: Compressor for [LOCKED]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@7c1a3a85,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [EXT_VERSION_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@1df8316a,
> ratio: 1.0
> 15/02/27 15:50:26 INFO DoubleColumnBuilder: Compressor for
> [INTERVAL_VALUE]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@11743585,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [INSERT_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@6465b7b6,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [LAST_UPD_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@52004138,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [ORG_ID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@6bcc5ad1,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [CHANNEL_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@355e86a1,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [SDP_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@50d8cf66,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [MEAS_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@4a185b01,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [DATE_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@4fec4a8,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [TIME_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@7220f427,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [VALIDATION_STATUS_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@517d66fa,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [VAL_FAIL_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@7ff3d0e1,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [INTERVAL_FLAG_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@6016a567,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [CHANGE_METHOD_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@6ec53e79,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [SOURCE_LAST_UPD_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@5788b2bf,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [INTERVAL_END_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@4f8f467e,
> ratio: 1.0
> 15/02/27 15:50:26 INFO StringColumnBuilder: Compressor for [LOCKED]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@475d2300,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [EXT_VERSION_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@33a4f99a,
> ratio: 1.0
> 15/02/27 15:50:26 INFO DoubleColumnBuilder: Compressor for
> [INTERVAL_VALUE]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@13ff07f3,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [INSERT_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@72a6e257,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [LAST_UPD_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@2a55f88f,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [ORG_ID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@5541f4b4,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [CHANNEL_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@5d288126,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [SDP_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@e371592,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [MEAS_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@42692b88,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [DATE_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@6a90fc8,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [TIME_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@454b16e2,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [VALIDATION_STATUS_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@44cb72f8,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for [VAL_FAIL_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@8e91b11,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [INTERVAL_FLAG_CD]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@7feffda8,
> ratio: 1.0
> 15/02/27 15:50:26 INFO IntColumnBuilder: Compressor for
> [CHANGE_METHOD_WID]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@64f66236,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [SOURCE_LAST_UPD_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@6ba9fb02,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [INTERVAL_END_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@649e7786,
> ratio: 1.0
> 15/02/27 15:50:26 INFO StringColumnBuilder: Compressor for [LOCKED]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@5fb93205,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [EXT_VERSION_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@7783175b,
> ratio: 1.0
> 15/02/27 15:50:26 INFO DoubleColumnBuilder: Compressor for
> [INTERVAL_VALUE]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@3f7294a9,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [INSERT_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@7b7e03c9,
> ratio: 1.0
> 15/02/27 15:50:26 INFO TimestampColumnBuilder: Compressor for
> [LAST_UPD_TIME]:
> org.apache.spark.sql.columnar.compression.PassThrough$Encoder@e2ac076,
> ratio: 1.0
> 15/02/27 15:50:26 INFO MemoryStore: ensureFreeSpace(180264) called with
> curMem=1092840, maxMem=278302556
> 15/02/27 15:50:26 INFO MemoryStore: Block rdd_21_0 stored as values in
> memory (estimated size 176.0 KB, free 264.2 MB)
> 15/02/27 15:50:26 INFO BlockManagerInfo: Added rdd_21_0 in memory on
> CDH-Master-1.cdhcluster:45530 (size: 176.0 KB, free: 265.2 MB)
> 15/02/27 15:50:26 INFO BlockManagerMaster: Updated info of block rdd_21_0
> 15/02/27 15:50:26 ERROR Executor: Exception in task 1.0 in stage 7.0 (TID
> 597)
> *java.lang.ArrayIndexOutOfBoundsException: 10*
>     at
> $line17.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:30)
>     at
> $line17.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:30)
>     at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>     at scala.collection.Iterator$$anon$1.next(Iterator.scala:853)
>     at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>     at
> org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryColumnarTableScan.scala:62)
>     at
> org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryColumnarTableScan.scala:50)
>     at
> org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:236)
>     at
> org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:163)
>     at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:70)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:227)
>     at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>     at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>     at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>     at
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
>     at
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
>     at org.apache.spark.scheduler.Task.run(Task.scala:54)
>     at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:180)
>     at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>     at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>     at java.lang.Thread.run(Thread.java:745)
> 15/02/27 15:50:26 WARN TaskSetManager: Lost task 1.0 in stage 7.0 (TID
> 597, localhost): java.lang.ArrayIndexOutOfBoundsException: 10
>
> $line17.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:30)
>
> $line17.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:30)
>         scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>         scala.collection.Iterator$$anon$1.next(Iterator.scala:853)
>         scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>
> org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryColumnarTableScan.scala:62)
>
> org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryColumnarTableScan.scala:50)
>
> org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:236)
>
> org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:163)
>         org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:70)
>         org.apache.spark.rdd.RDD.iterator(RDD.scala:227)
>
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>         org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>         org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>         org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>         org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>         org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>         org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
>
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
>         org.apache.spark.scheduler.Task.run(Task.scala:54)
>
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:180)
>
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>         java.lang.Thread.run(Thread.java:745)
> 15/02/27 15:50:26 ERROR TaskSetManager: Task 1 in stage 7.0 failed 1
> times; aborting job
> 15/02/27 15:50:26 INFO TaskSchedulerImpl: Cancelling stage 7
> 15/02/27 15:50:26 INFO Executor: Executor is trying to kill task 0.0 in
> stage 7.0 (TID 596)
> 15/02/27 15:50:26 INFO TaskSchedulerImpl: Stage 7 was cancelled
> 15/02/27 15:50:26 INFO DAGScheduler: Failed to run runJob at
> basicOperators.scala:136
> 15/02/27 15:50:26 INFO Executor: Executor killed task 0.0 in stage 7.0
> (TID 596)
> 15/02/27 15:50:26 WARN TaskSetManager: Lost task 0.0 in stage 7.0 (TID
> 596, localhost): TaskKilled (killed intentionally)
> 15/02/27 15:50:26 INFO TaskSchedulerImpl: Removed TaskSet 7.0, whose tasks
> have all completed, from pool
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 1
> in stage 7.0 failed 1 times, most recent failure: Lost task 1.0 in stage
> 7.0 (TID 597, localhost): java.lang.ArrayIndexOutOfBoundsException: 10
>         $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:30)
>         $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:30)
>         scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>         scala.collection.Iterator$$anon$1.next(Iterator.scala:853)
>         scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>
> org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryColumnarTableScan.scala:62)
>
> org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryColumnarTableScan.scala:50)
>
> org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:236)
>
> org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:163)
>         org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:70)
>         org.apache.spark.rdd.RDD.iterator(RDD.scala:227)
>
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>         org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>         org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>         org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>         org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>         org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>         org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
>
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
>         org.apache.spark.scheduler.Task.run(Task.scala:54)
>
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:180)
>
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>         java.lang.Thread.run(Thread.java:745)
> Driver stacktrace:
>     at org.apache.spark.scheduler.DAGScheduler.org
> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1185)
>     at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1174)
>     at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1173)
>     at
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>     at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>     at
> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1173)
>     at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
>     at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
>     at scala.Option.foreach(Option.scala:236)
>     at
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:688)
>     at
> org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1391)
>     at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
>     at akka.actor.ActorCell.invoke(ActorCell.scala:456)
>     at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
>     at akka.dispatch.Mailbox.run(Mailbox.scala:219)
>     at
> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
>     at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>     at
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>     at
> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>     at
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>
> *I am unable to comprehend the error log on my own. Please help.*
>
>
