eldenmoon commented on code in PR #20556:
URL: https://github.com/apache/doris/pull/20556#discussion_r1226142589


##########
be/src/vec/data_types/data_type_array.cpp:
##########
@@ -188,77 +189,64 @@ std::string DataTypeArray::to_string(const IColumn& 
column, size_t row_num) cons
     return str;
 }
 
-bool next_element_from_string(ReadBuffer& rb, StringRef& output, bool& 
has_quota) {
-    StringRef element(rb.position(), 0);
-    has_quota = false;
-    if (rb.eof()) {
-        return false;
+Status DataTypeArray::from_json(simdjson::ondemand::value& json_value, 
IColumn* column) const {
+    if (json_value.type() != simdjson::ondemand::json_type::array) {
+        return Status::InvalidArgument("Parse json data failed, not array type 
'{}'",
+                                       json_value.type().take_value());
     }
-
-    // ltrim
-    while (!rb.eof() && isspace(*rb.position())) {
-        ++rb.position();
-        element.data = rb.position();
-    }
-
-    // parse string
-    if (*rb.position() == '"' || *rb.position() == '\'') {
-        const char str_sep = *rb.position();
-        size_t str_len = 1;
-        // search until next '"' or '\''
-        while (str_len < rb.count() && *(rb.position() + str_len) != str_sep) {
-            ++str_len;
-        }
-        // invalid string
-        if (str_len >= rb.count()) {
-            rb.position() = rb.end();
-            return false;
-        }
-        has_quota = true;
-        rb.position() += str_len + 1;
-        element.size += str_len + 1;
-    }
-
-    // parse array element until array separator ',' or end ']'
-    while (!rb.eof() && (*rb.position() != ',') && (rb.count() != 1 || 
*rb.position() != ']')) {
-        // invalid elements such as ["123" 456,"789" 777]
-        // correct elements such as ["123"    ,"789"    ]
-        if (has_quota && !isspace(*rb.position())) {
-            return false;
+    simdjson::ondemand::array outer_array = json_value.get_array();
+    auto* array_column = assert_cast<ColumnArray*>(column);
+    auto& offsets = array_column->get_offsets();
+    IColumn& nested_column = array_column->get_data();
+    DCHECK(nested_column.is_nullable());
+    auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column);
+    bool is_string_nested = is_string(remove_nullable(nested));
+    size_t element_num = 0;
+    for (auto it = outer_array.begin(); it != outer_array.end(); ++it) {
+        Status st;
+        try {
+            if (is_complex_type(remove_nullable(nested))) {
+                simdjson::ondemand::value val;
+                auto error_code = (*it).get(val);
+                if (simdjson::SUCCESS != (*it).get(val)) {
+                    st = Status::InvalidArgument(
+                            "Parse json data failed, error code: {}, error "
+                            "info: {}",
+                            error_code, simdjson::error_message(error_code));
+                } else {
+                    st = nested->from_json(val, &nested_null_col);
+                }
+            } else {
+                std::string_view sv = 
simdjson::trim((*it).raw_json_token().value());
+                if (is_string_nested) {
+                    StringRef sr(sv.data(), sv.size());
+                    StringRef del("\"");
+                    sv = simd::VStringFunctions::trim(sr, del);
+                }
+                ReadBuffer nested_rb(const_cast<char*>(sv.data()), sv.size());
+                st = nested->from_string(nested_rb, &nested_column);

Review Comment:
   I don't think it's reasonable to call `from_string` here; the semantics of 
`from_json` are different from those of `from_string`.



##########
be/src/vec/data_types/data_type_nullable.cpp:
##########
@@ -80,10 +81,27 @@ void DataTypeNullable::to_string(const IColumn& column, 
size_t row_num,
     }
 }
 
+Status DataTypeNullable::from_json(simdjson::ondemand::value& json_value, 
IColumn* column) const {
+    DCHECK(is_complex_type(nested_data_type));

Review Comment:
   What if it's not a complex type? I think we should not DCHECK for a complex type here.



##########
be/src/vec/data_types/data_type_array.cpp:
##########
@@ -188,77 +189,64 @@ std::string DataTypeArray::to_string(const IColumn& 
column, size_t row_num) cons
     return str;
 }
 
-bool next_element_from_string(ReadBuffer& rb, StringRef& output, bool& 
has_quota) {
-    StringRef element(rb.position(), 0);
-    has_quota = false;
-    if (rb.eof()) {
-        return false;
+Status DataTypeArray::from_json(simdjson::ondemand::value& json_value, 
IColumn* column) const {
+    if (json_value.type() != simdjson::ondemand::json_type::array) {
+        return Status::InvalidArgument("Parse json data failed, not array type 
'{}'",
+                                       json_value.type().take_value());
     }
-
-    // ltrim
-    while (!rb.eof() && isspace(*rb.position())) {
-        ++rb.position();
-        element.data = rb.position();
-    }
-
-    // parse string
-    if (*rb.position() == '"' || *rb.position() == '\'') {
-        const char str_sep = *rb.position();
-        size_t str_len = 1;
-        // search until next '"' or '\''
-        while (str_len < rb.count() && *(rb.position() + str_len) != str_sep) {
-            ++str_len;
-        }
-        // invalid string
-        if (str_len >= rb.count()) {
-            rb.position() = rb.end();
-            return false;
-        }
-        has_quota = true;
-        rb.position() += str_len + 1;
-        element.size += str_len + 1;
-    }
-
-    // parse array element until array separator ',' or end ']'
-    while (!rb.eof() && (*rb.position() != ',') && (rb.count() != 1 || 
*rb.position() != ']')) {
-        // invalid elements such as ["123" 456,"789" 777]
-        // correct elements such as ["123"    ,"789"    ]
-        if (has_quota && !isspace(*rb.position())) {
-            return false;
+    simdjson::ondemand::array outer_array = json_value.get_array();
+    auto* array_column = assert_cast<ColumnArray*>(column);
+    auto& offsets = array_column->get_offsets();
+    IColumn& nested_column = array_column->get_data();
+    DCHECK(nested_column.is_nullable());
+    auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column);
+    bool is_string_nested = is_string(remove_nullable(nested));
+    size_t element_num = 0;
+    for (auto it = outer_array.begin(); it != outer_array.end(); ++it) {
+        Status st;
+        try {

Review Comment:
   Why not move the `try`/`catch` outside of the for loop? Could iterating the array also throw a simdjson exception?



##########
be/src/vec/data_types/data_type_nullable.cpp:
##########
@@ -80,10 +81,27 @@ void DataTypeNullable::to_string(const IColumn& column, 
size_t row_num,
     }
 }
 
+Status DataTypeNullable::from_json(simdjson::ondemand::value& json_value, 
IColumn* column) const {
+    DCHECK(is_complex_type(nested_data_type));

Review Comment:
   Why DCHECK for a complex type in DataTypeNullable?



##########
be/src/vec/data_types/data_type_array.cpp:
##########
@@ -188,77 +189,64 @@ std::string DataTypeArray::to_string(const IColumn& 
column, size_t row_num) cons
     return str;
 }
 
-bool next_element_from_string(ReadBuffer& rb, StringRef& output, bool& 
has_quota) {
-    StringRef element(rb.position(), 0);
-    has_quota = false;
-    if (rb.eof()) {
-        return false;
+Status DataTypeArray::from_json(simdjson::ondemand::value& json_value, 
IColumn* column) const {
+    if (json_value.type() != simdjson::ondemand::json_type::array) {
+        return Status::InvalidArgument("Parse json data failed, not array type 
'{}'",
+                                       json_value.type().take_value());
     }
-
-    // ltrim
-    while (!rb.eof() && isspace(*rb.position())) {
-        ++rb.position();
-        element.data = rb.position();
-    }
-
-    // parse string
-    if (*rb.position() == '"' || *rb.position() == '\'') {
-        const char str_sep = *rb.position();
-        size_t str_len = 1;
-        // search until next '"' or '\''
-        while (str_len < rb.count() && *(rb.position() + str_len) != str_sep) {
-            ++str_len;
-        }
-        // invalid string
-        if (str_len >= rb.count()) {
-            rb.position() = rb.end();
-            return false;
-        }
-        has_quota = true;
-        rb.position() += str_len + 1;
-        element.size += str_len + 1;
-    }
-
-    // parse array element until array separator ',' or end ']'
-    while (!rb.eof() && (*rb.position() != ',') && (rb.count() != 1 || 
*rb.position() != ']')) {
-        // invalid elements such as ["123" 456,"789" 777]
-        // correct elements such as ["123"    ,"789"    ]
-        if (has_quota && !isspace(*rb.position())) {
-            return false;
+    simdjson::ondemand::array outer_array = json_value.get_array();
+    auto* array_column = assert_cast<ColumnArray*>(column);
+    auto& offsets = array_column->get_offsets();
+    IColumn& nested_column = array_column->get_data();
+    DCHECK(nested_column.is_nullable());
+    auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column);
+    bool is_string_nested = is_string(remove_nullable(nested));
+    size_t element_num = 0;
+    for (auto it = outer_array.begin(); it != outer_array.end(); ++it) {
+        Status st;
+        try {
+            if (is_complex_type(remove_nullable(nested))) {
+                simdjson::ondemand::value val;
+                auto error_code = (*it).get(val);
+                if (simdjson::SUCCESS != (*it).get(val)) {
+                    st = Status::InvalidArgument(
+                            "Parse json data failed, error code: {}, error "
+                            "info: {}",
+                            error_code, simdjson::error_message(error_code));
+                } else {
+                    st = nested->from_json(val, &nested_null_col);
+                }
+            } else {
+                std::string_view sv = 
simdjson::trim((*it).raw_json_token().value());
+                if (is_string_nested) {
+                    StringRef sr(sv.data(), sv.size());
+                    StringRef del("\"");
+                    sv = simd::VStringFunctions::trim(sr, del);
+                }
+                ReadBuffer nested_rb(const_cast<char*>(sv.data()), sv.size());
+                st = nested->from_string(nested_rb, &nested_column);

Review Comment:
   Mixing them could be misleading.



##########
be/src/vec/data_types/data_type_array.cpp:
##########
@@ -188,77 +189,64 @@ std::string DataTypeArray::to_string(const IColumn& 
column, size_t row_num) cons
     return str;
 }
 
-bool next_element_from_string(ReadBuffer& rb, StringRef& output, bool& 
has_quota) {
-    StringRef element(rb.position(), 0);
-    has_quota = false;
-    if (rb.eof()) {
-        return false;
+Status DataTypeArray::from_json(simdjson::ondemand::value& json_value, 
IColumn* column) const {
+    if (json_value.type() != simdjson::ondemand::json_type::array) {
+        return Status::InvalidArgument("Parse json data failed, not array type 
'{}'",
+                                       json_value.type().take_value());
     }
-
-    // ltrim
-    while (!rb.eof() && isspace(*rb.position())) {
-        ++rb.position();
-        element.data = rb.position();
-    }
-
-    // parse string
-    if (*rb.position() == '"' || *rb.position() == '\'') {
-        const char str_sep = *rb.position();
-        size_t str_len = 1;
-        // search until next '"' or '\''
-        while (str_len < rb.count() && *(rb.position() + str_len) != str_sep) {
-            ++str_len;
-        }
-        // invalid string
-        if (str_len >= rb.count()) {
-            rb.position() = rb.end();
-            return false;
-        }
-        has_quota = true;
-        rb.position() += str_len + 1;
-        element.size += str_len + 1;
-    }
-
-    // parse array element until array separator ',' or end ']'
-    while (!rb.eof() && (*rb.position() != ',') && (rb.count() != 1 || 
*rb.position() != ']')) {
-        // invalid elements such as ["123" 456,"789" 777]
-        // correct elements such as ["123"    ,"789"    ]
-        if (has_quota && !isspace(*rb.position())) {
-            return false;
+    simdjson::ondemand::array outer_array = json_value.get_array();
+    auto* array_column = assert_cast<ColumnArray*>(column);
+    auto& offsets = array_column->get_offsets();
+    IColumn& nested_column = array_column->get_data();
+    DCHECK(nested_column.is_nullable());

Review Comment:
   what if nested is not nullable?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to