github-actions[bot] commented on code in PR #38277:
URL: https://github.com/apache/doris/pull/38277#discussion_r1689024588
##########
be/src/vec/exec/format/parquet/parquet_common.h:
##########
@@ -17,10 +17,12 @@
#pragma once
+#include <gen_cpp/parquet_types.h>
Review Comment:
warning: 'gen_cpp/parquet_types.h' file not found [clang-diagnostic-error]
```cpp
#include <gen_cpp/parquet_types.h>
^
```
##########
be/src/vec/exec/format/parquet/parquet_common.h:
##########
@@ -156,4 +158,303 @@
size_t _num_filtered;
size_t _read_index;
};
-} // namespace doris::vectorized
+
+enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER };
+
+enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN };
+
+class ParsedVersion {
+public:
+ ParsedVersion(std::string application, std::optional<std::string> version,
+ std::optional<std::string> appBuildHash)
+ : application(std::move(application)),
+ version(std::move(version)),
+ appBuildHash(std::move(appBuildHash)) {}
+
+ bool operator==(const ParsedVersion& other) const {
+ return application == other.application && version == other.version &&
+ appBuildHash == other.appBuildHash;
+ }
+
+ bool operator!=(const ParsedVersion& other) const { return !(*this ==
other); }
+
+ size_t hash() const {
+ std::hash<std::string> hasher;
+ return hasher(application) ^ (version ? hasher(*version) : 0) ^
+ (appBuildHash ? hasher(*appBuildHash) : 0);
+ }
+
+ std::string toString() const {
+ return "ParsedVersion(application=" + application +
+ ", semver=" + (version ? *version : "null") +
+ ", appBuildHash=" + (appBuildHash ? *appBuildHash : "null") +
")";
+ }
+
+public:
Review Comment:
warning: redundant access specifier has the same accessibility as the
previous access specifier [readability-redundant-access-specifiers]
```suggestion
```
<details>
<summary>Additional context</summary>
**be/src/vec/exec/format/parquet/parquet_common.h:166:** previously declared
here
```cpp
public:
^
```
</details>
##########
be/src/vec/exec/format/parquet/parquet_pred_cmp.h:
##########
@@ -142,24 +141,65 @@
break;
FOR_REINTERPRET_TYPES(DISPATCH)
#undef DISPATCH
+ case TYPE_FLOAT:
+ if constexpr (std::is_same_v<CppType, float>) {
+ if (col_schema->physical_type != tparquet::Type::FLOAT) return
false;
+ min_value = *reinterpret_cast<const
CppType*>(encoded_min.data());
+ max_value = *reinterpret_cast<const
CppType*>(encoded_max.data());
+ if (std::isnan(min_value) || std::isnan(max_value)) {
+ return false;
+ }
+ // Updating min to -0.0 and max to +0.0 to ensure that no 0.0
values would be skipped
+ if (std::signbit(min_value) == 0 && min_value == 0.0f) {
+ min_value = -0.0f;
+ }
+ if (std::signbit(max_value) != 0 && max_value == -0.0f) {
+ max_value = 0.0f;
Review Comment:
warning: floating point literal has suffix 'f', which is not uppercase
[readability-uppercase-literal-suffix]
```suggestion
max_value = 0.0F;
```
##########
be/src/vec/exec/format/parquet/parquet_common.h:
##########
@@ -156,4 +158,303 @@
size_t _num_filtered;
size_t _read_index;
};
-} // namespace doris::vectorized
+
+enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER };
+
+enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN };
+
+class ParsedVersion {
+public:
+ ParsedVersion(std::string application, std::optional<std::string> version,
+ std::optional<std::string> appBuildHash)
+ : application(std::move(application)),
+ version(std::move(version)),
+ appBuildHash(std::move(appBuildHash)) {}
+
+ bool operator==(const ParsedVersion& other) const {
+ return application == other.application && version == other.version &&
+ appBuildHash == other.appBuildHash;
+ }
+
+ bool operator!=(const ParsedVersion& other) const { return !(*this ==
other); }
+
+ size_t hash() const {
+ std::hash<std::string> hasher;
+ return hasher(application) ^ (version ? hasher(*version) : 0) ^
+ (appBuildHash ? hasher(*appBuildHash) : 0);
+ }
+
+ std::string toString() const {
+ return "ParsedVersion(application=" + application +
+ ", semver=" + (version ? *version : "null") +
+ ", appBuildHash=" + (appBuildHash ? *appBuildHash : "null") +
")";
+ }
+
+public:
+ std::string application;
+ std::optional<std::string> version;
+ std::optional<std::string> appBuildHash;
+};
+
+class VersionParser {
+public:
+ static Status parse(const std::string& createdBy,
+ std::unique_ptr<ParsedVersion>* parsedVersion) {
+ static const std::string FORMAT =
+
"(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?";
+ static const std::regex PATTERN(FORMAT);
+
+ std::smatch matcher;
+ if (!std::regex_match(createdBy, matcher, PATTERN)) {
+ return Status::InternalError(fmt::format(
+ "Could not parse created_by: {}, using format: {}",
createdBy, FORMAT));
+ }
+
+ std::string application = matcher[1].str();
+ if (application.empty()) {
+ return Status::InternalError("application cannot be null or
empty");
+ }
+ std::optional<std::string> semver = matcher[2].str().empty()
+ ? std::nullopt
+ :
std::optional<std::string>(matcher[2].str());
+ std::optional<std::string> appBuildHash =
+ matcher[3].str().empty() ? std::nullopt
+ :
std::optional<std::string>(matcher[3].str());
+ *parsedVersion = std::make_unique<ParsedVersion>(application, semver,
appBuildHash);
+ return Status::OK();
+ }
+};
+
+class SemanticVersion {
+public:
+ SemanticVersion(int major, int minor, int patch)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(false),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+
+#ifdef BE_TEST
+ SemanticVersion(int major, int minor, int patch, bool has_unknown)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(has_unknown),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+#endif
+
+ SemanticVersion(int major, int minor, int patch,
std::optional<std::string> unknown,
+ std::optional<std::string> pre, std::optional<std::string>
build_info)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(unknown.has_value() && !unknown.value().empty()),
+ _unknown(std::move(unknown)),
+ _pre(pre.has_value() ?
std::optional<Prerelease>(Prerelease(std::move(pre.value())))
+ : std::nullopt),
+ _build_info(std::move(build_info)) {}
+
+ static Status parse(const std::string& version,
+ std::unique_ptr<SemanticVersion>* semantic_version) {
+ static const std::regex pattern(
+ R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)");
+ std::smatch match;
+
+ if (!std::regex_match(version, match, pattern)) {
+ return Status::InternalError(version + " does not match format");
+ }
+
+ int major = std::stoi(match[1].str());
+ int minor = std::stoi(match[2].str());
+ int patch = std::stoi(match[3].str());
+ std::optional<std::string> unknown =
+ match[4].str().empty() ? std::nullopt :
std::optional<std::string>(match[4].str());
+ std::optional<std::string> prerelease =
+ match[5].str().empty() ? std::nullopt :
std::optional<std::string>(match[5].str());
+ std::optional<std::string> build_info =
+ match[6].str().empty() ? std::nullopt :
std::optional<std::string>(match[6].str());
+ if (major < 0 || minor < 0 || patch < 0) {
+ return Status::InternalError("major({}), minor({}), and patch({})
must all be >= 0",
+ major, minor, patch);
+ }
+ *semantic_version = std::make_unique<SemanticVersion>(major, minor,
patch, unknown,
+ prerelease,
build_info);
+ return Status::OK();
+ }
+
+ int compareTo(const SemanticVersion& other) const {
+ if (int cmp = compareIntegers(_major, other._major); cmp != 0) return
cmp;
Review Comment:
warning: statement should be inside braces
[readability-braces-around-statements]
```suggestion
if (int cmp = compareIntegers(_major, other._major); cmp != 0) {
return cmp;
}
```
##########
be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp:
##########
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
Review Comment:
warning: nested namespaces can be concatenated
[modernize-concat-nested-namespaces]
```suggestion
namespace doris::vectorized {
```
be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp:132:
```diff
- } // namespace vectorized
- } // namespace doris
+ } // namespace doris::vectorized
```
##########
be/test/vec/exec/parquet/parquet_statistics_test.cpp:
##########
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_pred_cmp.h"
+
+namespace doris {
+namespace vectorized {
Review Comment:
warning: nested namespaces can be concatenated
[modernize-concat-nested-namespaces]
```suggestion
namespace doris::vectorized {
```
be/test/vec/exec/parquet/parquet_statistics_test.cpp:-1:
```diff
+ }
```
##########
be/test/vec/exec/parquet/parquet_version_test.cpp:
##########
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
Review Comment:
warning: nested namespaces can be concatenated
[modernize-concat-nested-namespaces]
```suggestion
namespace doris::vectorized {
```
be/test/vec/exec/parquet/parquet_version_test.cpp:219:
```diff
- } // namespace vectorized
- } // namespace doris
+ } // namespace doris::vectorized
```
##########
be/src/vec/exec/format/parquet/parquet_common.h:
##########
@@ -156,4 +158,303 @@
size_t _num_filtered;
size_t _read_index;
};
-} // namespace doris::vectorized
+
+enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER };
+
+enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN };
+
+class ParsedVersion {
+public:
+ ParsedVersion(std::string application, std::optional<std::string> version,
+ std::optional<std::string> appBuildHash)
+ : application(std::move(application)),
+ version(std::move(version)),
+ appBuildHash(std::move(appBuildHash)) {}
+
+ bool operator==(const ParsedVersion& other) const {
+ return application == other.application && version == other.version &&
+ appBuildHash == other.appBuildHash;
+ }
+
+ bool operator!=(const ParsedVersion& other) const { return !(*this ==
other); }
+
+ size_t hash() const {
+ std::hash<std::string> hasher;
+ return hasher(application) ^ (version ? hasher(*version) : 0) ^
+ (appBuildHash ? hasher(*appBuildHash) : 0);
+ }
+
+ std::string toString() const {
+ return "ParsedVersion(application=" + application +
+ ", semver=" + (version ? *version : "null") +
+ ", appBuildHash=" + (appBuildHash ? *appBuildHash : "null") +
")";
+ }
+
+public:
+ std::string application;
+ std::optional<std::string> version;
+ std::optional<std::string> appBuildHash;
+};
+
+class VersionParser {
+public:
+ static Status parse(const std::string& createdBy,
+ std::unique_ptr<ParsedVersion>* parsedVersion) {
+ static const std::string FORMAT =
+
"(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?";
+ static const std::regex PATTERN(FORMAT);
+
+ std::smatch matcher;
+ if (!std::regex_match(createdBy, matcher, PATTERN)) {
+ return Status::InternalError(fmt::format(
+ "Could not parse created_by: {}, using format: {}",
createdBy, FORMAT));
+ }
+
+ std::string application = matcher[1].str();
+ if (application.empty()) {
+ return Status::InternalError("application cannot be null or
empty");
+ }
+ std::optional<std::string> semver = matcher[2].str().empty()
+ ? std::nullopt
+ :
std::optional<std::string>(matcher[2].str());
+ std::optional<std::string> appBuildHash =
+ matcher[3].str().empty() ? std::nullopt
+ :
std::optional<std::string>(matcher[3].str());
+ *parsedVersion = std::make_unique<ParsedVersion>(application, semver,
appBuildHash);
+ return Status::OK();
+ }
+};
+
+class SemanticVersion {
+public:
+ SemanticVersion(int major, int minor, int patch)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(false),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+
+#ifdef BE_TEST
+ SemanticVersion(int major, int minor, int patch, bool has_unknown)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(has_unknown),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+#endif
+
+ SemanticVersion(int major, int minor, int patch,
std::optional<std::string> unknown,
+ std::optional<std::string> pre, std::optional<std::string>
build_info)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(unknown.has_value() && !unknown.value().empty()),
+ _unknown(std::move(unknown)),
+ _pre(pre.has_value() ?
std::optional<Prerelease>(Prerelease(std::move(pre.value())))
+ : std::nullopt),
+ _build_info(std::move(build_info)) {}
+
+ static Status parse(const std::string& version,
+ std::unique_ptr<SemanticVersion>* semantic_version) {
+ static const std::regex pattern(
+ R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)");
+ std::smatch match;
+
+ if (!std::regex_match(version, match, pattern)) {
+ return Status::InternalError(version + " does not match format");
+ }
+
+ int major = std::stoi(match[1].str());
+ int minor = std::stoi(match[2].str());
+ int patch = std::stoi(match[3].str());
+ std::optional<std::string> unknown =
+ match[4].str().empty() ? std::nullopt :
std::optional<std::string>(match[4].str());
+ std::optional<std::string> prerelease =
+ match[5].str().empty() ? std::nullopt :
std::optional<std::string>(match[5].str());
+ std::optional<std::string> build_info =
+ match[6].str().empty() ? std::nullopt :
std::optional<std::string>(match[6].str());
+ if (major < 0 || minor < 0 || patch < 0) {
+ return Status::InternalError("major({}), minor({}), and patch({})
must all be >= 0",
+ major, minor, patch);
+ }
+ *semantic_version = std::make_unique<SemanticVersion>(major, minor,
patch, unknown,
+ prerelease,
build_info);
+ return Status::OK();
+ }
+
+ int compareTo(const SemanticVersion& other) const {
+ if (int cmp = compareIntegers(_major, other._major); cmp != 0) return
cmp;
+ if (int cmp = compareIntegers(_minor, other._minor); cmp != 0) return
cmp;
Review Comment:
warning: statement should be inside braces
[readability-braces-around-statements]
```suggestion
if (int cmp = compareIntegers(_minor, other._minor); cmp != 0) {
return cmp;
}
```
##########
be/src/vec/exec/format/parquet/parquet_pred_cmp.h:
##########
@@ -142,24 +141,65 @@ class ParquetPredicate {
break;
FOR_REINTERPRET_TYPES(DISPATCH)
#undef DISPATCH
+ case TYPE_FLOAT:
+ if constexpr (std::is_same_v<CppType, float>) {
+ if (col_schema->physical_type != tparquet::Type::FLOAT) return
false;
Review Comment:
warning: statement should be inside braces
[readability-braces-around-statements]
```suggestion
if (col_schema->physical_type != tparquet::Type::FLOAT) {
return false;
}
```
##########
be/src/vec/exec/format/parquet/parquet_common.h:
##########
@@ -156,4 +158,303 @@
size_t _num_filtered;
size_t _read_index;
};
-} // namespace doris::vectorized
+
+enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER };
+
+enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN };
+
+class ParsedVersion {
+public:
+ ParsedVersion(std::string application, std::optional<std::string> version,
+ std::optional<std::string> appBuildHash)
+ : application(std::move(application)),
+ version(std::move(version)),
+ appBuildHash(std::move(appBuildHash)) {}
+
+ bool operator==(const ParsedVersion& other) const {
+ return application == other.application && version == other.version &&
+ appBuildHash == other.appBuildHash;
+ }
+
+ bool operator!=(const ParsedVersion& other) const { return !(*this ==
other); }
+
+ size_t hash() const {
+ std::hash<std::string> hasher;
+ return hasher(application) ^ (version ? hasher(*version) : 0) ^
+ (appBuildHash ? hasher(*appBuildHash) : 0);
+ }
+
+ std::string toString() const {
+ return "ParsedVersion(application=" + application +
+ ", semver=" + (version ? *version : "null") +
+ ", appBuildHash=" + (appBuildHash ? *appBuildHash : "null") +
")";
+ }
+
+public:
+ std::string application;
+ std::optional<std::string> version;
+ std::optional<std::string> appBuildHash;
+};
+
+class VersionParser {
+public:
+ static Status parse(const std::string& createdBy,
+ std::unique_ptr<ParsedVersion>* parsedVersion) {
+ static const std::string FORMAT =
+
"(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?";
+ static const std::regex PATTERN(FORMAT);
+
+ std::smatch matcher;
+ if (!std::regex_match(createdBy, matcher, PATTERN)) {
+ return Status::InternalError(fmt::format(
+ "Could not parse created_by: {}, using format: {}",
createdBy, FORMAT));
+ }
+
+ std::string application = matcher[1].str();
+ if (application.empty()) {
+ return Status::InternalError("application cannot be null or
empty");
+ }
+ std::optional<std::string> semver = matcher[2].str().empty()
+ ? std::nullopt
+ :
std::optional<std::string>(matcher[2].str());
+ std::optional<std::string> appBuildHash =
+ matcher[3].str().empty() ? std::nullopt
+ :
std::optional<std::string>(matcher[3].str());
+ *parsedVersion = std::make_unique<ParsedVersion>(application, semver,
appBuildHash);
+ return Status::OK();
+ }
+};
+
+class SemanticVersion {
+public:
+ SemanticVersion(int major, int minor, int patch)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(false),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+
+#ifdef BE_TEST
+ SemanticVersion(int major, int minor, int patch, bool has_unknown)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(has_unknown),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+#endif
+
+ SemanticVersion(int major, int minor, int patch,
std::optional<std::string> unknown,
+ std::optional<std::string> pre, std::optional<std::string>
build_info)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(unknown.has_value() && !unknown.value().empty()),
+ _unknown(std::move(unknown)),
+ _pre(pre.has_value() ?
std::optional<Prerelease>(Prerelease(std::move(pre.value())))
+ : std::nullopt),
+ _build_info(std::move(build_info)) {}
+
+ static Status parse(const std::string& version,
+ std::unique_ptr<SemanticVersion>* semantic_version) {
+ static const std::regex pattern(
+ R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)");
+ std::smatch match;
+
+ if (!std::regex_match(version, match, pattern)) {
+ return Status::InternalError(version + " does not match format");
+ }
+
+ int major = std::stoi(match[1].str());
+ int minor = std::stoi(match[2].str());
+ int patch = std::stoi(match[3].str());
+ std::optional<std::string> unknown =
+ match[4].str().empty() ? std::nullopt :
std::optional<std::string>(match[4].str());
+ std::optional<std::string> prerelease =
+ match[5].str().empty() ? std::nullopt :
std::optional<std::string>(match[5].str());
+ std::optional<std::string> build_info =
+ match[6].str().empty() ? std::nullopt :
std::optional<std::string>(match[6].str());
+ if (major < 0 || minor < 0 || patch < 0) {
+ return Status::InternalError("major({}), minor({}), and patch({})
must all be >= 0",
+ major, minor, patch);
+ }
+ *semantic_version = std::make_unique<SemanticVersion>(major, minor,
patch, unknown,
+ prerelease,
build_info);
+ return Status::OK();
+ }
+
+ int compareTo(const SemanticVersion& other) const {
+ if (int cmp = compareIntegers(_major, other._major); cmp != 0) return
cmp;
+ if (int cmp = compareIntegers(_minor, other._minor); cmp != 0) return
cmp;
+ if (int cmp = compareIntegers(_patch, other._patch); cmp != 0) return
cmp;
+ if (int cmp = compareBooleans(other._prerelease, _prerelease); cmp !=
0) return cmp;
Review Comment:
warning: statement should be inside braces
[readability-braces-around-statements]
```suggestion
if (int cmp = compareBooleans(other._prerelease, _prerelease); cmp != 0) {
return cmp;
}
```
##########
be/src/vec/exec/format/parquet/parquet_pred_cmp.h:
##########
@@ -142,24 +141,65 @@
break;
FOR_REINTERPRET_TYPES(DISPATCH)
#undef DISPATCH
+ case TYPE_FLOAT:
+ if constexpr (std::is_same_v<CppType, float>) {
+ if (col_schema->physical_type != tparquet::Type::FLOAT) return
false;
+ min_value = *reinterpret_cast<const
CppType*>(encoded_min.data());
+ max_value = *reinterpret_cast<const
CppType*>(encoded_max.data());
+ if (std::isnan(min_value) || std::isnan(max_value)) {
+ return false;
+ }
+ // Updating min to -0.0 and max to +0.0 to ensure that no 0.0
values would be skipped
+ if (std::signbit(min_value) == 0 && min_value == 0.0f) {
+ min_value = -0.0f;
Review Comment:
warning: floating point literal has suffix 'f', which is not uppercase
[readability-uppercase-literal-suffix]
```suggestion
min_value = -0.0F;
```
##########
be/test/vec/exec/parquet/parquet_statistics_test.cpp:
##########
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
Review Comment:
warning: 'gtest/gtest.h' file not found [clang-diagnostic-error]
```cpp
#include <gtest/gtest.h>
^
```
##########
be/src/vec/exec/format/parquet/parquet_common.h:
##########
@@ -156,4 +158,303 @@
size_t _num_filtered;
size_t _read_index;
};
-} // namespace doris::vectorized
+
+enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER };
+
+enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN };
+
+class ParsedVersion {
+public:
+ ParsedVersion(std::string application, std::optional<std::string> version,
+ std::optional<std::string> appBuildHash)
+ : application(std::move(application)),
+ version(std::move(version)),
+ appBuildHash(std::move(appBuildHash)) {}
+
+ bool operator==(const ParsedVersion& other) const {
+ return application == other.application && version == other.version &&
+ appBuildHash == other.appBuildHash;
+ }
+
+ bool operator!=(const ParsedVersion& other) const { return !(*this ==
other); }
+
+ size_t hash() const {
+ std::hash<std::string> hasher;
+ return hasher(application) ^ (version ? hasher(*version) : 0) ^
+ (appBuildHash ? hasher(*appBuildHash) : 0);
+ }
+
+ std::string toString() const {
+ return "ParsedVersion(application=" + application +
+ ", semver=" + (version ? *version : "null") +
+ ", appBuildHash=" + (appBuildHash ? *appBuildHash : "null") +
")";
+ }
+
+public:
+ std::string application;
+ std::optional<std::string> version;
+ std::optional<std::string> appBuildHash;
+};
+
+class VersionParser {
+public:
+ static Status parse(const std::string& createdBy,
+ std::unique_ptr<ParsedVersion>* parsedVersion) {
+ static const std::string FORMAT =
+
"(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?";
+ static const std::regex PATTERN(FORMAT);
+
+ std::smatch matcher;
+ if (!std::regex_match(createdBy, matcher, PATTERN)) {
+ return Status::InternalError(fmt::format(
+ "Could not parse created_by: {}, using format: {}",
createdBy, FORMAT));
+ }
+
+ std::string application = matcher[1].str();
+ if (application.empty()) {
+ return Status::InternalError("application cannot be null or
empty");
+ }
+ std::optional<std::string> semver = matcher[2].str().empty()
+ ? std::nullopt
+ :
std::optional<std::string>(matcher[2].str());
+ std::optional<std::string> appBuildHash =
+ matcher[3].str().empty() ? std::nullopt
+ :
std::optional<std::string>(matcher[3].str());
+ *parsedVersion = std::make_unique<ParsedVersion>(application, semver,
appBuildHash);
+ return Status::OK();
+ }
+};
+
+class SemanticVersion {
+public:
+ SemanticVersion(int major, int minor, int patch)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(false),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+
+#ifdef BE_TEST
+ SemanticVersion(int major, int minor, int patch, bool has_unknown)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(has_unknown),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+#endif
+
+ SemanticVersion(int major, int minor, int patch,
std::optional<std::string> unknown,
+ std::optional<std::string> pre, std::optional<std::string>
build_info)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(unknown.has_value() && !unknown.value().empty()),
+ _unknown(std::move(unknown)),
+ _pre(pre.has_value() ?
std::optional<Prerelease>(Prerelease(std::move(pre.value())))
+ : std::nullopt),
+ _build_info(std::move(build_info)) {}
+
+ static Status parse(const std::string& version,
+ std::unique_ptr<SemanticVersion>* semantic_version) {
+ static const std::regex pattern(
+ R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)");
+ std::smatch match;
+
+ if (!std::regex_match(version, match, pattern)) {
+ return Status::InternalError(version + " does not match format");
+ }
+
+ int major = std::stoi(match[1].str());
+ int minor = std::stoi(match[2].str());
+ int patch = std::stoi(match[3].str());
+ std::optional<std::string> unknown =
+ match[4].str().empty() ? std::nullopt :
std::optional<std::string>(match[4].str());
+ std::optional<std::string> prerelease =
+ match[5].str().empty() ? std::nullopt :
std::optional<std::string>(match[5].str());
+ std::optional<std::string> build_info =
+ match[6].str().empty() ? std::nullopt :
std::optional<std::string>(match[6].str());
+ if (major < 0 || minor < 0 || patch < 0) {
+ return Status::InternalError("major({}), minor({}), and patch({})
must all be >= 0",
+ major, minor, patch);
+ }
+ *semantic_version = std::make_unique<SemanticVersion>(major, minor,
patch, unknown,
+ prerelease,
build_info);
+ return Status::OK();
+ }
+
+ int compareTo(const SemanticVersion& other) const {
+ if (int cmp = compareIntegers(_major, other._major); cmp != 0) return
cmp;
+ if (int cmp = compareIntegers(_minor, other._minor); cmp != 0) return
cmp;
+ if (int cmp = compareIntegers(_patch, other._patch); cmp != 0) return
cmp;
Review Comment:
warning: statement should be inside braces
[readability-braces-around-statements]
```suggestion
if (int cmp = compareIntegers(_patch, other._patch); cmp != 0) {
return cmp;
}
```
##########
be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp:
##########
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
Review Comment:
warning: 'gtest/gtest.h' file not found [clang-diagnostic-error]
```cpp
#include <gtest/gtest.h>
^
```
##########
be/test/vec/exec/parquet/parquet_statistics_test.cpp:
##########
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_pred_cmp.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetStatisticsTest : public testing::Test {
+public:
+ ParquetStatisticsTest() {}
+};
+
+TEST_F(ParquetStatisticsTest, test_try_read_old_utf8_stats) {
Review Comment:
warning: function 'TEST_F' exceeds recommended size/complexity thresholds
[readability-function-size]
```cpp
TEST_F(ParquetStatisticsTest, test_try_read_old_utf8_stats) {
^
```
<details>
<summary>Additional context</summary>
**be/test/vec/exec/parquet/parquet_statistics_test.cpp:30:** 121 lines
including whitespace and comments (threshold 80)
```cpp
TEST_F(ParquetStatisticsTest, test_try_read_old_utf8_stats) {
^
```
</details>
##########
be/src/vec/exec/format/parquet/parquet_pred_cmp.h:
##########
@@ -142,24 +141,65 @@
break;
FOR_REINTERPRET_TYPES(DISPATCH)
#undef DISPATCH
+ case TYPE_FLOAT:
+ if constexpr (std::is_same_v<CppType, float>) {
+ if (col_schema->physical_type != tparquet::Type::FLOAT) return false;
+ min_value = *reinterpret_cast<const CppType*>(encoded_min.data());
+ max_value = *reinterpret_cast<const CppType*>(encoded_max.data());
+ if (std::isnan(min_value) || std::isnan(max_value)) {
+ return false;
+ }
+ // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
+ if (std::signbit(min_value) == 0 && min_value == 0.0f) {
+ min_value = -0.0f;
+ }
+ if (std::signbit(max_value) != 0 && max_value == -0.0f) {
+ max_value = 0.0f;
+ }
+ break;
+ } else {
+ return false;
+ }
+ case TYPE_DOUBLE:
+ if constexpr (std::is_same_v<CppType, float>) {
+ if (col_schema->physical_type != tparquet::Type::DOUBLE) return false;
Review Comment:
warning: statement should be inside braces
[readability-braces-around-statements]
```suggestion
if (col_schema->physical_type != tparquet::Type::DOUBLE) {
return false;
}
```
##########
be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp:
##########
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetCorruptStatisticsTest : public testing::Test {
+public:
+ ParquetCorruptStatisticsTest() {}
Review Comment:
warning: use '= default' to define a trivial default constructor
[modernize-use-equals-default]
```suggestion
ParquetCorruptStatisticsTest() = default;
```
##########
be/test/vec/exec/parquet/parquet_statistics_test.cpp:
##########
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_pred_cmp.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetStatisticsTest : public testing::Test {
+public:
+ ParquetStatisticsTest() {}
Review Comment:
warning: use '= default' to define a trivial default constructor
[modernize-use-equals-default]
```suggestion
ParquetStatisticsTest() = default;
```
##########
be/test/vec/exec/parquet/parquet_version_test.cpp:
##########
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetVersionTest : public testing::Test {
+public:
+ ParquetVersionTest() {}
Review Comment:
warning: use '= default' to define a trivial default constructor
[modernize-use-equals-default]
```suggestion
ParquetVersionTest() = default;
```
##########
be/test/vec/exec/parquet/parquet_version_test.cpp:
##########
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
Review Comment:
warning: 'gtest/gtest.h' file not found [clang-diagnostic-error]
```cpp
#include <gtest/gtest.h>
^
```
##########
be/test/vec/exec/parquet/parquet_version_test.cpp:
##########
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetVersionTest : public testing::Test {
+public:
+ ParquetVersionTest() {}
+};
+
+TEST_F(ParquetVersionTest, test_version_parser) {
Review Comment:
warning: function 'TEST_F' exceeds recommended size/complexity thresholds
[readability-function-size]
```cpp
TEST_F(ParquetVersionTest, test_version_parser) {
^
```
<details>
<summary>Additional context</summary>
**be/test/vec/exec/parquet/parquet_version_test.cpp:30:** 91 lines including
whitespace and comments (threshold 80)
```cpp
TEST_F(ParquetVersionTest, test_version_parser) {
^
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]