This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 915d8989c5 [feature](spark-load)Spark load supports string type data import (#11927)
915d8989c5 is described below

commit 915d8989c5774638190d487d77c2079a422aea27
Author: jiafeng.zhang <zhang...@gmail.com>
AuthorDate: Mon Aug 22 08:56:59 2022 +0800

    [feature](spark-load)Spark load supports string type data import (#11927)
---
 be/src/olap/push_handler.cpp                       |  1 +
 .../apache/doris/load/loadv2/dpp/ColumnParser.java | 23 ++++++++++++++++++++++
 .../org/apache/doris/load/loadv2/dpp/DppUtils.java |  6 ++++++
 .../org/apache/doris/load/loadv2/dpp/SparkDpp.java | 12 +++++++++++
 4 files changed, 42 insertions(+)

diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp
index 7cae605904..e6b858015a 100644
--- a/be/src/olap/push_handler.cpp
+++ b/be/src/olap/push_handler.cpp
@@ -905,6 +905,7 @@ Status PushBrokerReader::fill_field_row(RowCursorCell* dst, const char* src, boo
     case OLAP_FIELD_TYPE_DOUBLE:
     case OLAP_FIELD_TYPE_CHAR:
     case OLAP_FIELD_TYPE_VARCHAR:
+    case OLAP_FIELD_TYPE_STRING:
     case OLAP_FIELD_TYPE_HLL:
     case OLAP_FIELD_TYPE_OBJECT: {
         dst->set_is_null(src_null);
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
index e72d471a9f..1425dd3bef 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/ColumnParser.java
@@ -29,6 +29,7 @@ import java.math.BigInteger;
 import java.time.format.DateTimeFormatter;
 import java.time.format.DateTimeFormatterBuilder;
 
+
 // Parser to validate value for different type
 public abstract class ColumnParser implements Serializable {
 
@@ -62,6 +63,9 @@ public abstract class ColumnParser implements Serializable {
             return new DateParser();
         } else if (columnType.equalsIgnoreCase("DATETIME")) {
             return new DatetimeParser();
+        } else if (columnType.equalsIgnoreCase("STRING")
+                || columnType.equalsIgnoreCase("TEXT")) {
+            return new StringTypeParser(etlColumn);
         } else if (columnType.equalsIgnoreCase("VARCHAR")
                 || columnType.equalsIgnoreCase("CHAR")
                 || columnType.equalsIgnoreCase("BITMAP")
@@ -208,6 +212,25 @@ class StringParser extends ColumnParser {
     }
 }
 
+class StringTypeParser extends ColumnParser {
+
+    private EtlJobConfig.EtlColumn etlColumn;
+
+    public StringTypeParser(EtlJobConfig.EtlColumn etlColumn) {
+        this.etlColumn = etlColumn;
+    }
+
+    @Override
+    public boolean parse(String value) {
+        try {
+            return value.getBytes("UTF-8").length <= DppUtils.STRING_LENGTH_LIMIT;
+        } catch (Exception e) {
+            throw new RuntimeException("string check failed ", e);
+        }
+    }
+}
+
+
 class DecimalParser extends ColumnParser {
 
     public static int PRECISION = 27;
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
index c53e1ae087..9a252b38b7 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/DppUtils.java
@@ -41,6 +41,8 @@ import java.util.zip.CRC32;
 public class DppUtils {
     public static final String BUCKET_ID = "__bucketId__";
 
+    public static final int STRING_LENGTH_LIMIT = 1048576;
+
     public static Class getClassFromDataType(DataType dataType) {
         if (dataType == null) {
             return null;
@@ -94,6 +96,8 @@ public class DppUtils {
             case "HLL":
             case "CHAR":
             case "VARCHAR":
+            case "STRING":
+            case "TEXT":
             case "BITMAP":
             case "OBJECT":
                 return String.class;
@@ -142,6 +146,8 @@ public class DppUtils {
                 break;
             case "CHAR":
             case "VARCHAR":
+            case "STRING":
+            case "TEXT":
             case "OBJECT":
                 dataType = DataTypes.StringType;
                 break;
diff --git a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
index ab7e791a1c..5d951ad70b 100644
--- a/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
+++ b/fe/spark-dpp/src/main/java/org/apache/doris/load/loadv2/dpp/SparkDpp.java
@@ -405,6 +405,18 @@ public final class SparkDpp implements java.io.Serializable {
                     return false;
                 }
                 break;
+            case "STRING":
+            case "TEXT":
+                // TODO(zjf) padding string type
+                int strDataSize = 0;
+                if (srcValue != null && (strDataSize = srcValue.toString().getBytes(StandardCharsets.UTF_8).length)
+                        > DppUtils.STRING_LENGTH_LIMIT) {
+                    LOG.warn(String.format("The string type is limited to a maximum of %s bytes."
+                                    + " column_name:%s,input_str[%s],actual length:%s",
+                            DppUtils.STRING_LENGTH_LIMIT, etlColumn.columnName, row.toString(), strDataSize));
+                    return false;
+                }
+                break;
             default:
                 return true;
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to