Hi all:I was using spark sql on hive for doing some statistics. While using
spark sql, I found that, sometimes, spark-sql got the incorrect results while
select. And if I executed the same select statement again, it returned the
different results which was correct. That is saying, sometime, with the same
input and the same select logic, the select result is different. That is so
weird!Is there any hints? Thanks.
val selectJoin =
s"""
| insert into table $tblLoginLogStatDay
| select '$statDate' as date, '$USER_TYPE' as user_type, a.platform,
a.channel, a.product_version,
| coalesce(c.total_user_90,0), a.total_user,
coalesce(b.one_time_startup, 0), a.total_startup, coalesce(f.dau,0),
coalesce(d.wau,0), coalesce(e.mau,0)
| from
| (select platform, channel, product_version, count(distinct
$userType) as total_user, sum(login_cnt) as total_startup
| from $tblNewTable
| group by platform, channel, product_version) a
|
| left outer join
|
| (select platform, channel, product_version, count(distinct
$userType) as one_time_startup
| from $tblNewTable
| where login_cnt = 1
| group by platform, channel, product_version) b
| on a. platform = b.platform and a.channel = b.channel and
a.product_version = b.product_version
|
| left outer join
|
| (select platform, channel, product_version, count(distinct
$userType) as total_user_90
| from $tblNewTable90
| group by platform, channel, product_version) c
| on a.platform = c.platform and a.channel = c.channel and
a.product_version = c.product_version
|
| left outer join
|
| (select platform, channel, product_version, count(distinct
$userType) as wau
| from $tblNewTable90
| where from_unixtimestamp_to_date(last_time) >=
date_sub('$statDate', 6)
| group by platform, channel, product_version) d
| on a.platform = d.platform and a.channel = d.channel and
a.product_version = d.product_version
|
| left outer join
|
| (select platform, channel, product_version, count(distinct
$userType) as mau
| from $tblNewTable90
| where from_unixtimestamp_to_date(last_time) >=
date_sub('$statDate', 29)
| group by platform, channel, product_version) e
| on a.platform = e.platform and a.channel = e.channel and
a.product_version = e.product_version
|
| left outer join
|
| (select platform, channel, product_version, count(distinct
$userType) as dau
| from $tblNewTable90
| where from_unixtimestamp_to_date(last_time) = '$statDate'
| group by platform, channel, product_version) f
| on a.platform = f.platform and a.channel = f.channel and
a.product_version = f.product_version
""".stripMargin
spark version: 1.2.0
hive version: 0.10.0-cdh4.2.1