Shawn Zhang created SPARK-18006:
-----------------------------------
Summary: When union, spark SQL didn't complain about schema
mismatch
Key: SPARK-18006
URL: https://issues.apache.org/jira/browse/SPARK-18006
Project: Spark
Issue Type: Bug
Components: Java API
Affects Versions: 2.0.1
Reporter: Shawn Zhang
Priority: Minor
When taking the union of two Dataset<Row> objects, Spark checks that they have the same number of columns.
But if the column order differs between the two, the result is silently wrong instead of an error being raised.
The output of the following code shows that the columns have been switched by Spark.
================= Code =============
package test;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import audit_spark.SparkConfig;
/**
 * Minimal reproduction for SPARK-18006.
 *
 * <p>Builds one DataFrame from a list of {@link User} beans and one empty
 * DataFrame from an explicit {@code (uid, dateline)} schema, unions them, and
 * prints the result. No schema-mismatch error is raised even though the
 * printed values end up under the wrong column headers.
 */
public class SchemaBug {

    /** Simple bean with two {@code long} properties, {@code uid} and {@code dateline}. */
    public static class User {
        long uid;
        long dateline;

        /**
         * @param uid      value for the {@code uid} property
         * @param dateline value for the {@code dateline} property
         */
        public User(long uid, long dateline) {
            this.uid = uid;
            this.dateline = dateline;
        }

        /** @return the current {@code uid} value */
        public long getUid() {
            return uid;
        }

        /** @param uid new {@code uid} value */
        public void setUid(long uid) {
            this.uid = uid;
        }

        /** @return the current {@code dateline} value */
        public long getDateline() {
            return dateline;
        }

        /** @param dateline new {@code dateline} value */
        public void setDateline(long dateline) {
            this.dateline = dateline;
        }
    }

    /**
     * Runs the reproduction and prints the unioned DataFrame.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The session is built (or fetched) here, but this handle is not
        // referenced again: both DataFrames below are created through
        // SparkConfig.sparkSession.
        // NOTE(review): presumably both refer to the same session - confirm.
        SparkSession sparkSession = SparkSession
                .builder()
                .appName("test")
                .config("spark.sql.warehouse.dir", "file:///")
                .getOrCreate();

        // Explicit schema listing uid first, then dateline.
        StructType userSchema2 = new StructType(new StructField[]{
                new StructField("uid", DataTypes.LongType,
                        false, Metadata.empty()),
                new StructField("dateline", DataTypes.LongType,
                        false, Metadata.empty()),
        });

        // Parameterized instead of raw List/ArrayList so the compiler checks
        // the element type.
        List<User> userList = new ArrayList<User>();
        userList.add(new User(1, System.currentTimeMillis()));
        userList.add(new User(2, System.currentTimeMillis()));

        // ds1: schema derived from the User bean class.
        Dataset<Row> ds1 =
                SparkConfig.sparkSession.createDataFrame(userList, User.class);
        // ds2: no rows, but carries the explicit userSchema2.
        Dataset<Row> ds2 =
                SparkConfig.sparkSession.createDataFrame(new ArrayList<Row>(), userSchema2);

        // ds2 is the left operand, and the reported output carries ds2's
        // header (uid, dateline) with ds1's values swapped underneath it.
        // NOTE(review): presumably the bean-derived schema of ds1 orders its
        // columns differently and union matches by position - confirm.
        ds2.union(ds1).show();
    }
}
=========== Program Output ===============
| uid|dateline|
|1476867071496| 1|
|1476867071496| 2|
=========== Expected Output ===============
| dateline |uid|
|1476867071496| 1|
|1476867071496| 2|
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]