This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3c5a98e2caa [enhance](parquet) support decimal256 for parquet reader
(#41526)
3c5a98e2caa is described below
commit 3c5a98e2caa8d5fe9da39fcf738264c0cce51dc6
Author: Socrates <[email protected]>
AuthorDate: Wed Oct 9 19:09:33 2024 +0800
[enhance](parquet) support decimal256 for parquet reader (#41526)
## Proposed changes
Implemented reading Parquet files with the decimal256 type
---
be/src/gutil/endian.h | 13 +++++++++++--
be/src/util/bit_util.h | 9 ++++++++-
.../exec/format/parquet/parquet_column_convert.cpp | 5 ++++-
.../vec/exec/format/parquet/parquet_column_convert.h | 19 ++++++++++++++++++-
be/test/util/bit_util_test.cpp | 19 ++++++++++++++++++-
.../hdfs_tvf/test_parquet_decimal256.parquet | Bin 0 -> 1320 bytes
.../data/external_table_p0/tvf/test_hdfs_tvf.out | 7 +++++++
.../external_table_p0/tvf/test_hdfs_tvf.groovy | 8 ++++++++
8 files changed, 74 insertions(+), 6 deletions(-)
diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h
index 4bc04e1e303..f78480b3cf5 100644
--- a/be/src/gutil/endian.h
+++ b/be/src/gutil/endian.h
@@ -60,8 +60,8 @@ inline unsigned __int128 gbswap_128(unsigned __int128
host_int) {
}
inline wide::UInt256 gbswap_256(wide::UInt256 host_int) {
- wide::UInt256 result{gbswap_64(host_int.items[3]),
gbswap_64(host_int.items[2]),
- gbswap_64(host_int.items[1]),
gbswap_64(host_int.items[0])};
+ wide::UInt256 result {gbswap_64(host_int.items[3]),
gbswap_64(host_int.items[2]),
+ gbswap_64(host_int.items[1]),
gbswap_64(host_int.items[0])};
return result;
}
@@ -136,6 +136,9 @@ public:
static unsigned __int128 FromHost128(unsigned __int128 x) { return x; }
static unsigned __int128 ToHost128(unsigned __int128 x) { return x; }
+ static wide::UInt256 FromHost256(wide::UInt256 x) { return x; }
+ static wide::UInt256 ToHost256(wide::UInt256 x) { return x; }
+
static bool IsLittleEndian() { return true; }
#elif defined IS_BIG_ENDIAN
@@ -149,6 +152,12 @@ public:
static uint64 FromHost64(uint64 x) { return gbswap_64(x); }
static uint64 ToHost64(uint64 x) { return gbswap_64(x); }
+ static unsigned __int128 FromHost128(unsigned __int128 x) { return
gbswap_128(x); }
+ static unsigned __int128 ToHost128(unsigned __int128 x) { return
gbswap_128(x); }
+
+ static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); }
+ static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); }
+
static bool IsLittleEndian() { return false; }
#endif /* ENDIAN */
diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h
index 44b391f44da..504b0b27428 100644
--- a/be/src/util/bit_util.h
+++ b/be/src/util/bit_util.h
@@ -20,6 +20,9 @@
#pragma once
+#include <type_traits>
+
+#include "vec/core/wide_integer.h"
#ifndef __APPLE__
#include <endian.h>
#endif
@@ -209,7 +212,11 @@ public:
template <typename T>
static T big_endian_to_host(T value) {
- if constexpr (std::is_same_v<T, __int128>) {
+ if constexpr (std::is_same_v<T, wide::Int256>) {
+ return BigEndian::ToHost256(value);
+ } else if constexpr (std::is_same_v<T, wide::UInt256>) {
+ return BigEndian::ToHost256(value);
+ } else if constexpr (std::is_same_v<T, __int128>) {
return BigEndian::ToHost128(value);
} else if constexpr (std::is_same_v<T, unsigned __int128>) {
return BigEndian::ToHost128(value);
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
index 2fb0afea82a..0a5ef2913dd 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
@@ -19,6 +19,7 @@
#include <cctz/time_zone.h>
+#include "runtime/define_primitive_type.h"
#include "vec/columns/column_nullable.h"
namespace doris::vectorized::parquet {
const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone();
@@ -27,7 +28,8 @@ const cctz::time_zone ConvertParams::utc0 =
cctz::utc_time_zone();
M(TYPE_DECIMALV2) \
M(TYPE_DECIMAL32) \
M(TYPE_DECIMAL64) \
- M(TYPE_DECIMAL128I)
+ M(TYPE_DECIMAL128I) \
+ M(TYPE_DECIMAL256)
bool PhysicalToLogicalConverter::is_parquet_native_type(PrimitiveType type) {
switch (type) {
@@ -50,6 +52,7 @@ bool
PhysicalToLogicalConverter::is_decimal_type(doris::PrimitiveType type) {
case TYPE_DECIMAL32:
case TYPE_DECIMAL64:
case TYPE_DECIMAL128I:
+ case TYPE_DECIMAL256:
case TYPE_DECIMALV2:
return true;
default:
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h
b/be/src/vec/exec/format/parquet/parquet_column_convert.h
index 91b81121aa4..cf6f8aa13fa 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.h
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h
@@ -20,6 +20,7 @@
#include <gen_cpp/parquet_types.h>
#include "vec/core/types.h"
+#include "vec/core/wide_integer.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/exec/format/column_type_convert.h"
#include "vec/exec/format/format_common.h"
@@ -401,7 +402,23 @@ public:
M(13, int128_t) \
M(14, int128_t) \
M(15, int128_t) \
- M(16, int128_t)
+ M(16, int128_t) \
+ M(17, wide::Int256) \
+ M(18, wide::Int256) \
+ M(19, wide::Int256) \
+ M(20, wide::Int256) \
+ M(21, wide::Int256) \
+ M(22, wide::Int256) \
+ M(23, wide::Int256) \
+ M(24, wide::Int256) \
+ M(25, wide::Int256) \
+ M(26, wide::Int256) \
+ M(27, wide::Int256) \
+ M(28, wide::Int256) \
+ M(29, wide::Int256) \
+ M(30, wide::Int256) \
+ M(31, wide::Int256) \
+ M(32, wide::Int256)
switch (_type_length) {
APPLY_FOR_DECIMALS()
diff --git a/be/test/util/bit_util_test.cpp b/be/test/util/bit_util_test.cpp
index 514daafa604..fd3bee01432 100644
--- a/be/test/util/bit_util_test.cpp
+++ b/be/test/util/bit_util_test.cpp
@@ -21,7 +21,6 @@
#include <gtest/gtest-test-part.h>
#include <boost/utility/binary.hpp>
-#include <memory>
#include "gtest/gtest_pred_impl.h"
@@ -48,4 +47,22 @@ TEST(BitUtil, Popcount) {
EXPECT_EQ(BitUtil::popcount_no_hw(0), 0);
}
+TEST(BitUtil, BigEndianToHost) {
+ uint16_t v16 = 0x1234;
+ uint32_t v32 = 0x12345678;
+ uint64_t v64 = 0x123456789abcdef0;
+ unsigned __int128 v128 = ((__int128)0x123456789abcdef0LL << 64) |
0x123456789abcdef0LL;
+ wide::UInt256 v256 =
+ wide::UInt256(0x123456789abcdef0) << 192 |
wide::UInt256(0x123456789abcdef0) << 128 |
+ wide::UInt256(0x123456789abcdef0) << 64 |
wide::UInt256(0x123456789abcdef0);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v16), 0x3412);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v32), 0x78563412);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v64), 0xf0debc9a78563412);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v128),
+ ((__int128)0xf0debc9a78563412LL << 64) | 0xf0debc9a78563412LL);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v256),
+ wide::UInt256(0xf0debc9a78563412) << 192 |
wide::UInt256(0xf0debc9a78563412) << 128 |
+ wide::UInt256(0xf0debc9a78563412) << 64 |
wide::UInt256(0xf0debc9a78563412));
+}
+
} // namespace doris
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
new file mode 100644
index 00000000000..323ded32160
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
differ
diff --git a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
index 3f12b448581..e850e38a237 100644
--- a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
+++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
@@ -221,6 +221,13 @@
19 Supplier#000000019 edZT3es,nBFD8lBXTGeTl 24 34-278-310-2731
6150.38 refully final foxes across the dogged theodolites sleep slyly abou
20 Supplier#000000020 iybAE,RmTymrZVYaFZva2SH,j 3
13-715-945-6730 530.82 n, ironic ideas would nag blithely about the slyly
regular accounts. silent, expr
+-- !parquet_decimal256 --
+1
99999999999999999999999999999999999999.99999999999999999999999999999999999999
+2
-99999999999999999999999999999999999999.99999999999999999999999999999999999999
+3 1E-38
+4 -1E-38
+5 0E-38
+
-- !orc --
1 goldenrod lavender spring chocolate lace Manufacturer#1
Brand#13 PROMO BURNISHED COPPER 7 JUMBO PKG 901.00 ly.
slyly ironi
2 blush thistle blue yellow saddle Manufacturer#1 Brand#13
LARGE BRUSHED BRASS 1 LG CASE 902.00 lar accounts amo
diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
index 8c4028bfefe..02bda4ec0dd 100644
--- a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
+++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
@@ -108,6 +108,14 @@ suite("test_hdfs_tvf","external,hive,tvf,external_docker")
{
"hadoop.username" = "${hdfsUserName}",
"format" = "${format}") order by s_suppkey limit
20; """
+ // test parquet decimal256
+ uri = "${defaultFS}" +
"/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet"
+ format = "parquet"
+ qt_parquet_decimal256 """ select * from HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "${format}") order by id; """
+
// test orc
uri = "${defaultFS}" +
"/user/doris/preinstalled_data/hdfs_tvf/test_orc.snappy.orc"
format = "orc"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]