[
https://issues.apache.org/jira/browse/SPARK-17709?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15543921#comment-15543921
]
Ashish Shrowty edited comment on SPARK-17709 at 10/4/16 12:47 AM:
------------------------------------------------------------------
[~dkbiswal] Sorry Dilip ... I keep making typos ... the join was on companyid and
productid -
scala> df1.join(df2, Seq("companyid","productid"))
org.apache.spark.sql.AnalysisException: using columns ['companyid,'productid] can not be resolved given input columns: [companyid, productid, avgprice, avgitemcount] ;
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:40)
  at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:58)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:174)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:67)
  at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:67)
  at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:58)
  at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
  at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
  at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:2589)
  at org.apache.spark.sql.Dataset.join(Dataset.scala:641)
  at org.apache.spark.sql.Dataset.join(Dataset.scala:614)
  ... 48 elided
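For reference, a minimal sketch of the setup that appears to produce this failure. The table, column names, and aliases are taken from the plans and the exception above; the 0.5 sample fraction is read off the Sample operator in the plans and is otherwise an assumption.

// Minimal reproduction sketch, assuming the setup implied by the plans above.
import org.apache.spark.sql.functions.avg

val d1 = spark.sql("select * from referencedata.testproduct")
  .sample(false, 0.5)   // the plans show a Sample(0.0, 0.5, false, seed) operator

val df1 = d1.groupBy("companyid", "productid")
  .agg(avg("price").as("avgprice"))

val df2 = d1.groupBy("companyid", "productid")
  .agg(avg("itemcount").as("avgitemcount"))

// Fails with the AnalysisException shown above:
df1.join(df2, Seq("companyid", "productid"))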
Attached are the explain outputs for df1 and df2 -
scala> df1.explain
== Physical Plan ==
*HashAggregate(keys=[companyid#53, productid#54], functions=[avg(price#56)])
+- Exchange hashpartitioning(companyid#53, productid#54, 200)
   +- *HashAggregate(keys=[companyid#53, productid#54], functions=[partial_avg(price#56)])
      +- *Sample 0.0, 0.5, false, 2419324063718201506
         +- *Project [companyid#53, productid#54, price#56]
            +- *BatchedScan parquet referencedata.testproduct[productid#54,price#56,companyid#53] Format: ParquetFormat, InputPaths: s3://com.birdzi.datalake.test/testtable/companyid=100, s3://com.birdzi.datalake.test/testtable/co..., PushedFilters: [], ReadSchema: struct<productid:int,price:double>
scala> df2.explain
== Physical Plan ==
*HashAggregate(keys=[companyid#53, productid#54], functions=[avg(cast(itemcount#57 as bigint))])
+- Exchange hashpartitioning(companyid#53, productid#54, 200)
   +- *HashAggregate(keys=[companyid#53, productid#54], functions=[partial_avg(cast(itemcount#57 as bigint))])
      +- *Sample 0.0, 0.5, false, -7492644014085475670
         +- *Project [companyid#53, productid#54, itemcount#57]
            +- *BatchedScan parquet referencedata.testproduct[productid#54,itemcount#57,companyid#53] Format: ParquetFormat, InputPaths: s3://com.birdzi.datalake.test/testtable/companyid=100, s3://com.birdzi.datalake.test/testtable/co..., PushedFilters: [], ReadSchema: struct<productid:int,itemcount:int>
Also, the table structure for reference -
scala> spark.sql("describe referencedata.testproduct").show
+--------------------+---------+-------+
| col_name|data_type|comment|
+--------------------+---------+-------+
| productid| int| null|
| name| string| null|
| price| double| null|
| itemcount| int| null|
| companyid| int| null|
|# Partition Infor...| | |
| # col_name|data_type|comment|
| companyid| int| null|
+--------------------+---------+-------+
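For completeness, a hypothetical DDL that would yield this schema (partitioned by companyid, stored as Parquet). The storage format is inferred from the scans above; the EXTERNAL keyword and the LOCATION are assumptions based on the partition path shown in InputPaths.

// Hypothetical DDL matching the describe output; EXTERNAL and LOCATION are assumptions.
spark.sql("""
  CREATE EXTERNAL TABLE referencedata.testproduct (
    productid INT,
    name      STRING,
    price     DOUBLE,
    itemcount INT
  )
  PARTITIONED BY (companyid INT)
  STORED AS PARQUET
  LOCATION 's3://com.birdzi.datalake.test/testtable'
""")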
> spark 2.0 join - column resolution error
> ----------------------------------------
>
> Key: SPARK-17709
> URL: https://issues.apache.org/jira/browse/SPARK-17709
> Project: Spark
> Issue Type: Bug
> Affects Versions: 2.0.0
> Reporter: Ashish Shrowty
> Labels: easyfix
>
> If I try to inner-join two dataframes which originated from the same initial
> dataframe that was loaded using a spark.sql() call, it results in an error -
> // reading from Hive ... the data is stored in Parquet format in Amazon S3
> val d1 = spark.sql("select * from <hivetable>")
> val df1 = d1.groupBy("key1","key2")
>   .agg(avg("totalprice").as("avgtotalprice"))
> val df2 = d1.groupBy("key1","key2")
>   .agg(avg("itemcount").as("avgqty"))
> df1.join(df2, Seq("key1","key2")) gives the error -
> org.apache.spark.sql.AnalysisException: using columns ['key1,'key2] can not be resolved given input columns: [key1, key2, avgtotalprice, avgqty];
> If the same Dataframe is initialized via spark.read.parquet(), the above code
> works. The same code worked with Spark 1.6.2.
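A sketch of the workaround the reporter mentions: building the base DataFrame with the file-based Parquet reader instead of spark.sql(). The <s3path> placeholder stands in for the actual data location and mirrors the <hivetable> placeholder used above.

// Workaround sketch, per the reporter's note: with a file-based source the
// same using-columns join resolves correctly.
import org.apache.spark.sql.functions.avg

val d1 = spark.read.parquet("<s3path>")   // instead of spark.sql("select * from <hivetable>")

val df1 = d1.groupBy("key1", "key2").agg(avg("totalprice").as("avgtotalprice"))
val df2 = d1.groupBy("key1", "key2").agg(avg("itemcount").as("avgqty"))

df1.join(df2, Seq("key1", "key2"))   // works with the file-based source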