like this:

    val sc = new SparkContext(new SparkConf().setAppName("SLA Filter"))
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext._

    val suffix = args(0)
    sqlContext.parquetFile("/user/hive/warehouse/xxx_parquet.db/xxxxxx001_" + suffix).registerAsTable("xxxxxx001")
    sqlContext.parquetFile("/user/hive/warehouse/xxx_parquet.db/xxxxxx002_" + suffix).registerAsTable("xxxxxx002")
    ...
    ...
    var xxxxxx001 = sql("select some_id, some_type, some_time from xxxxxx001").map(line => (line(0), (line(1).toString, line(2).toString.substring(0, 19))))
    var xxxxxx002 = sql("select some_id, some_type, some_time from xxxxxx002").map(line => (line(0), (line(1).toString, line(2).toString.substring(0, 19))))
    ...
    ...
    var all = xxxxxx001 union xxxxxx002 ... union ...
    all.groupByKey.filter( kv => FilterSLA.filterSLA(kv._2.toSeq) ).saveAsTextFile(xxx)

filterSLA turns the input Seq[(String, String)] into a Map and then checks something like: if the map contains type1 and type2, whether timestamp_type1 - timestamp_type2 > 2 days.

thanks
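For reference, a minimal sketch of what that filterSLA check could look like, assuming the two types of interest are literally "type1" and "type2" (hypothetical placeholders for the real some_type values) and that the timestamps are "yyyy-MM-dd HH:mm:ss" strings, which the substring(0, 19) above suggests:

    import java.text.SimpleDateFormat

    object FilterSLA {
      // Sketch: keep a key only when both event types are present and
      // type1 occurred more than 2 days after type2.
      def filterSLA(events: Seq[(String, String)]): Boolean = {
        val byType = events.toMap  // some_type -> some_time; last entry wins per type
        byType.contains("type1") && byType.contains("type2") && {
          val fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
          val t1 = fmt.parse(byType("type1")).getTime
          val t2 = fmt.parse(byType("type2")).getTime
          t1 - t2 > 2L * 24 * 60 * 60 * 1000  // strictly more than 2 days, in milliseconds
        }
      }
    }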