This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 915d8989c5 [feature](spark-load)Spark load supports string type data import (#11927) 915d8989c5 is described below commit 915d8989c5774638190d487d77c2079a422aea27 Author: jiafeng.zhang <zhang...@gmail.com> AuthorDate: Mon Aug 22 08:56:59 2022 +0800 [feature](spark-load)Spark load supports string type data import (#11927) --- be/src/olap/push_handler.cpp | 1 + .../apache/doris/load/loadv2/dpp/ColumnParser.java | 23 ++++++++++++++++++++++ .../org/apache/doris/load/loadv2/dpp/DppUtils.java | 6 ++++++ .../org/apache/doris/load/loadv2/dpp/SparkDpp.java | 12 +++++++++++ 4 files changed, 42 insertions(+) diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 7cae605904..e6b858015a 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -905,6 +905,7 @@ Status PushBrokerReader::fill_field_row(RowCursorCell* dst, const char* src, boo case OLAP_FIELD_TYPE_DOUBLE: case OLAP_FIELD_TYPE_CHAR: case OLAP_FIELD_TYPE_VARCHAR: + case OLAP_FIELD_TYPE_STRING: case OLAP_FIELD_TYPE_HLL: case OLAP_FIELD_TYPE_OBJECT: { dst->set_is_null(src_null); diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java index e72d471a9f..1425dd3bef 100644 --- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java +++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java @@ -29,6 +29,7 @@ import java.math.BigInteger; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; + // Parser to validate value for different type public abstract class ColumnParser implements Serializable { @@ -62,6 +63,9 @@ public abstract class ColumnParser implements Serializable { return new DateParser(); } else if (columnType.equalsIgnoreCase("DATETIME")) { return new DatetimeParser(); + } else if (columnType.equalsIgnoreCase("STRING") + || columnType.equalsIgnoreCase("TEXT")) { + return new StringTypeParser(etlColumn); } else if (columnType.equalsIgnoreCase("VARCHAR") || columnType.equalsIgnoreCase("CHAR") || columnType.equalsIgnoreCase("BITMAP") @@ -208,6 +212,25 @@ class StringParser extends ColumnParser { } } +class StringTypeParser extends ColumnParser { + + private EtlJobConfig.EtlColumn etlColumn; + + public StringTypeParser(EtlJobConfig.EtlColumn etlColumn) { + this.etlColumn = etlColumn; + } + + @Override + public boolean parse(String value) { + try { + return value.getBytes("UTF-8").length <= DppUtils.STRING_LENGTH_LIMIT; + } catch (Exception e) { + throw new RuntimeException("string check failed ", e); + } + } +} + + class DecimalParser extends ColumnParser { public static int PRECISION = 27; diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java index c53e1ae087..9a252b38b7 100644 --- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java +++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java @@ -41,6 +41,8 @@ import java.util.zip.CRC32; public class DppUtils { public static final String BUCKET_ID = "__bucketId__"; + public static final int STRING_LENGTH_LIMIT = 1048576; + public static Class getClassFromDataType(DataType dataType) { if (dataType == null) { return null; @@ -94,6 +96,8 @@ public class DppUtils { case "HLL": case "CHAR": case "VARCHAR": + case "STRING": + case "TEXT": case "BITMAP": case "OBJECT": return String.class; @@ -142,6 +146,8 @@ public class DppUtils { break; case "CHAR": case "VARCHAR": + case "STRING": + case "TEXT": case "OBJECT": dataType = DataTypes.StringType; break; diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java index ab7e791a1c..5d951ad70b 100644 --- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java +++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java @@ -405,6 +405,18 @@ public final class SparkDpp implements java.io.Serializable { return false; } break; + case "STRING": + case "TEXT": + // TODO(zjf) padding string type + int strDataSize = 0; + if (srcValue != null && (strDataSize = srcValue.toString().getBytes(StandardCharsets.UTF_8).length) + > DppUtils.STRING_LENGTH_LIMIT) { + LOG.warn(String.format("The string type is limited to a maximum of %s bytes." + + " column_name:%s,input_str[%s],actual length:%s", + DppUtils.STRING_LENGTH_LIMIT, etlColumn.columnName, row.toString(), strDataSize)); + return false; + } + break; default: return true; } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org