This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new c59189cd2ba [fix](build) Backport thirdparty updates for Arrow LZO (#65191)
c59189cd2ba is described below
commit c59189cd2baaf2ddec54b02ffeed0d9d342a8291
Author: Gabriel <[email protected]>
AuthorDate: Fri Jul 3 14:55:14 2026 +0800
[fix](build) Backport thirdparty updates for Arrow LZO (#65191)
### What problem does this PR solve?
Issue Number: None
Related PR: #65046
Problem Summary: Branch 4.1 shares the rebuilt Doris thirdparty
artifacts with master. PR #65046 added Parquet LZO page decompression
support to the Arrow/Parquet thirdparty patch, so libparquet.a now
references lzo symbols. Branch 4.1 already builds lzo2 as a
thirdparty package, but its BE CMake thirdparty list did not link lzo2,
so builds that use the shared, updated libparquet.a can fail with
unresolved lzo symbols.
This backports the thirdparty-related changes from #65046: the Arrow LZO
patch, lzo2 linkage, FlatBuffers version update and include cleanup, and
Arrow include path setup during thirdparty build.
### Release note
None
### Check List (For Author)
- Test: Manual test
- `git diff --cached --check`
- `bash -n thirdparty/build-thirdparty.sh thirdparty/download-thirdparty.sh thirdparty/vars.sh`
- Behavior changed: No
- Does this need documentation: No
---
be/cmake/thirdparty.cmake | 1 +
thirdparty/build-thirdparty.sh | 5 +-
thirdparty/download-thirdparty.sh | 2 +
thirdparty/patches/apache-arrow-17.0.0-lzo.patch | 84 ++++++++++++++++++++++++
thirdparty/vars.sh | 8 +--
5 files changed, 95 insertions(+), 5 deletions(-)
diff --git a/be/cmake/thirdparty.cmake b/be/cmake/thirdparty.cmake
index 227f81411f1..8aa4ae73020 100644
--- a/be/cmake/thirdparty.cmake
+++ b/be/cmake/thirdparty.cmake
@@ -66,6 +66,7 @@ add_thirdparty(gmock)
add_thirdparty(snappy)
add_thirdparty(curl)
add_thirdparty(lz4)
+add_thirdparty(lzo2)
add_thirdparty(thrift)
add_thirdparty(thriftnb)
add_thirdparty(crc32c)
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index 50f882d0b72..c8b4afdb6a1 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -995,6 +995,7 @@ build_flatbuffers() {
"${BUILD_SYSTEM}" -j "${PARALLEL}"
cp flatc ../../../installed/bin/flatc
+ rm -rf ../../../installed/include/flatbuffers
cp -r ../include/flatbuffers ../../../installed/include/flatbuffers
cp libflatbuffers.a ../../../installed/lib/libflatbuffers.a
}
@@ -1084,7 +1085,9 @@ build_arrow() {
ldflags="-L${TP_LIB_DIR}"
fi
- LDFLAGS="${ldflags}" \
+ CPPFLAGS="-I${TP_INCLUDE_DIR}" \
+ CXXFLAGS="-I${TP_INCLUDE_DIR}" \
+ LDFLAGS="${ldflags}" \
"${CMAKE_CMD}" -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
-G "${GENERATOR}" -DARROW_PARQUET=ON -DARROW_IPC=ON
-DARROW_BUILD_SHARED=OFF \
-DARROW_BUILD_STATIC=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_LZ4=ON
-DARROW_USE_GLOG=ON \
diff --git a/thirdparty/download-thirdparty.sh
b/thirdparty/download-thirdparty.sh
index a61f520ea35..feb94de4bdd 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -455,6 +455,8 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
# std::string objects in RELRO, then crash while initializing them.
patch -p1
<"${TP_PATCH_DIR}/apache-arrow-17.0.0-status-inline-static-fix.patch"
+ # Add Parquet LZO page decompression support used by file scanner
v2.
+ patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-lzo.patch"
touch "${PATCHED_MARK}"
fi
cd -
diff --git a/thirdparty/patches/apache-arrow-17.0.0-lzo.patch
b/thirdparty/patches/apache-arrow-17.0.0-lzo.patch
new file mode 100644
index 00000000000..a983818413a
--- /dev/null
+++ b/thirdparty/patches/apache-arrow-17.0.0-lzo.patch
@@ -0,0 +1,84 @@
+--- a/cpp/src/parquet/column_reader.cc
++++ b/cpp/src/parquet/column_reader.cc
+@@ -30,0 +31,2 @@
++
++#include <lzo/lzo1x.h>
+@@ -268,0 +269 @@
++ compression_codec_(codec),
+@@ -279 +282,7 @@
+- decompressor_ = GetCodec(codec);
++ if (compression_codec_ == Compression::LZO) {
++ if (lzo_init() != LZO_E_OK) {
++ throw ParquetException("Failed to initialize LZO codec");
++ }
++ } else {
++ decompressor_ = GetCodec(codec);
++ }
+@@ -315,0 +325 @@
++ Compression::type compression_codec_;
+@@ -585 +595 @@
+- if (decompressor_ == nullptr) {
++ if (decompressor_ == nullptr && compression_codec_ != Compression::LZO) {
+@@ -601,0 +612,61 @@
++ if (compression_codec_ == Compression::LZO) {
++ const uint8_t* input = page_buffer->data() + levels_byte_len;
++ const uint8_t* const input_end = page_buffer->data() + compressed_len;
++ uint8_t* output = decompression_buffer_->mutable_data() + levels_byte_len;
++ uint8_t* const output_end = decompression_buffer_->mutable_data() +
uncompressed_len;
++
++ auto load_big_endian_u32 = [](const uint8_t* data) {
++ return (static_cast<uint32_t>(data[0]) << 24) |
++ (static_cast<uint32_t>(data[1]) << 16) |
++ (static_cast<uint32_t>(data[2]) << 8) |
static_cast<uint32_t>(data[3]);
++ };
++
++ while (input < input_end) {
++ if (input_end - input < 4) {
++ throw ParquetException("LZO page decompression failed: truncated
large block length");
++ }
++
++ uint32_t large_block_uncompressed_len = load_big_endian_u32(input);
++ input += 4;
++ if (static_cast<size_t>(output_end - output) <
large_block_uncompressed_len) {
++ throw ParquetException("LZO page decompression failed: output buffer
too small");
++ }
++
++ while (large_block_uncompressed_len > 0) {
++ if (input_end - input < 4) {
++ throw ParquetException("LZO page decompression failed: truncated
small block length");
++ }
++
++ uint32_t small_block_compressed_len = load_big_endian_u32(input);
++ input += 4;
++ if (static_cast<size_t>(input_end - input) <
small_block_compressed_len) {
++ throw ParquetException("LZO page decompression failed: truncated
small block data");
++ }
++
++ auto small_block_uncompressed_len =
++ static_cast<lzo_uint>(large_block_uncompressed_len);
++ const int result =
++ lzo1x_decompress_safe(input,
static_cast<lzo_uint>(small_block_compressed_len),
++ output, &small_block_uncompressed_len,
nullptr);
++ if (result != LZO_E_OK) {
++ throw ParquetException("LZO page decompression failed, error: " +
++ std::to_string(result));
++ }
++ if (small_block_uncompressed_len > large_block_uncompressed_len) {
++ throw ParquetException("LZO page decompression failed: invalid
small block size");
++ }
++
++ input += small_block_compressed_len;
++ output += small_block_uncompressed_len;
++ large_block_uncompressed_len -= small_block_uncompressed_len;
++ }
++ }
++ if (output != output_end) {
++ throw ParquetException("Page didn't decompress to expected size,
expected: " +
++ std::to_string(uncompressed_len -
levels_byte_len) + ", but got:" +
++ std::to_string(output -
(decompression_buffer_->mutable_data() +
++ levels_byte_len)));
++ }
++
++ return decompression_buffer_;
++ }
++
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index 155a4f6c898..77ed4e32e35 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -242,10 +242,10 @@ BROTLI_SOURCE="brotli-1.0.9"
BROTLI_MD5SUM="c2274f0c7af8470ad514637c35bcee7d"
# flatbuffers
-FLATBUFFERS_DOWNLOAD="https://github.com/google/flatbuffers/archive/v2.0.0.tar.gz"
-FLATBUFFERS_NAME=flatbuffers-2.0.0.tar.gz
-FLATBUFFERS_SOURCE=flatbuffers-2.0.0
-FLATBUFFERS_MD5SUM="a27992324c3cbf86dd888268a23d17bd"
+FLATBUFFERS_DOWNLOAD="https://github.com/google/flatbuffers/archive/v23.5.26.tar.gz"
+FLATBUFFERS_NAME=flatbuffers-23.5.26.tar.gz
+FLATBUFFERS_SOURCE=flatbuffers-23.5.26
+FLATBUFFERS_MD5SUM="2ef00eaaa86ab5e9ad5eafe09c2e7b60"
# c-ares
CARES_DOWNLOAD="https://github.com/c-ares/c-ares/releases/download/cares-1_19_1/c-ares-1.19.1.tar.gz"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]