This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new 5993dbd9148 branch-4.1: [chore](thirdparty) Integrate paimon-cpp into
thirdparty build system #60296 (#61369)
5993dbd9148 is described below
commit 5993dbd91487837ac89ac2b14c99901ac7057c37
Author: Chenjunwei <[email protected]>
AuthorDate: Tue Mar 17 09:18:42 2026 +0800
branch-4.1: [chore](thirdparty) Integrate paimon-cpp into thirdparty build
system #60296 (#61369)
Cherry-pick #60296 to branch-4.1
### What problem does this PR solve?
- **Related PR**: #60296
Integrate paimon-cpp into thirdparty build system.
### Cherry-pick commit
- `2658f896441` - [chore](thirdparty) Integrate paimon-cpp into
thirdparty build system (#60296)
---------
Co-authored-by: Socrates <[email protected]>
---
.github/workflows/build-thirdparty.yml | 45 ++-
thirdparty/build-thirdparty.sh | 215 ++++++++++++-
thirdparty/download-thirdparty.sh | 98 ++++++
thirdparty/paimon-cpp-cache.cmake | 136 +++++++++
.../patches/apache-arrow-17.0.0-paimon.patch | 224 ++++++++++++++
.../paimon-cpp-buildutils-static-deps.patch | 331 +++++++++++++++++++++
thirdparty/vars.sh | 18 +-
7 files changed, 1049 insertions(+), 18 deletions(-)
diff --git a/.github/workflows/build-thirdparty.yml
b/.github/workflows/build-thirdparty.yml
index 6e7a888f416..cc707cf516b 100644
--- a/.github/workflows/build-thirdparty.yml
+++ b/.github/workflows/build-thirdparty.yml
@@ -65,16 +65,17 @@ jobs:
run: |
git clone -b v7 https://github.com/easimon/maximize-build-space
- - name: Maximize build space
- uses: ./maximize-build-space
- with:
- root-reserve-mb: 4096
- swap-size-mb: 8192
- remove-dotnet: 'true'
- remove-android: 'true'
- remove-haskell: 'true'
- remove-codeql: 'true'
- remove-docker-images: 'true'
+ #- name: Maximize build space
+ # uses: ./maximize-build-space
+ # with:
+ # root-reserve-mb: 4096
+ # temp-reserve-mb: 4096
+ # swap-size-mb: 8192
+ # remove-dotnet: 'true'
+ # remove-android: 'true'
+ # remove-haskell: 'true'
+ # remove-codeql: 'true'
+ # remove-docker-images: 'true'
- name: Checkout ${{ github.ref }}
uses: actions/checkout@v4
@@ -135,8 +136,18 @@ jobs:
export CMAKE_POLICY_VERSION_MINIMUM=3.10
export CUSTOM_CMAKE="/usr/local/bin/cmake"
+ #export TMPDIR=/home/runner/work/doris/doris/.tmp
+ #export TMP=$TMPDIR
+ #export TEMP=$TMPDIR
+ #mkdir -p $TMPDIR
+
+ df -h
+ echo $TMPDIR
+ echo $RUNNER_TEMP
+
cd thirdparty
- ./build-thirdparty.sh -j "$(nproc)"
+ #./build-thirdparty.sh -j "$(nproc)"
+ ./build-thirdparty.sh -j 2
build_macos:
name: Build Third Party Libraries (macOS)
@@ -195,7 +206,12 @@ jobs:
export CMAKE_POLICY_VERSION_MINIMUM=3.10
export CUSTOM_CMAKE="/usr/local/bin/cmake"
+ df -h
+ echo $TMPDIR
+ echo $RUNNER_TEMP
+
cd thirdparty
+ #./build-thirdparty.sh -j "$(nproc)"
./build-thirdparty.sh -j "$(nproc)"
build_macos_arm64:
@@ -255,6 +271,11 @@ jobs:
export CMAKE_POLICY_VERSION_MINIMUM=3.10
export CUSTOM_CMAKE="/usr/local/bin/cmake"
+ df -h
+ echo $TMPDIR
+ echo $RUNNER_TEMP
+
cd thirdparty
- ./build-thirdparty.sh -j "$(nproc)"
+ #./build-thirdparty.sh -j "$(nproc)"
+ ./build-thirdparty.sh -j 2
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index 7e440719bd7..18a6f5ac4de 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -638,7 +638,7 @@ build_lz4() {
build_crc32c() {
check_if_source_exist "${CRC32C_SOURCE}"
cd "${TP_SOURCE_DIR}/${CRC32C_SOURCE}"
-
+
mkdir -p "${BUILD_DIR}"
cd "${BUILD_DIR}"
@@ -1090,6 +1090,10 @@ build_arrow() {
-DARROW_BUILD_STATIC=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_LZ4=ON
-DARROW_USE_GLOG=ON \
-DARROW_WITH_SNAPPY=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON
-DARROW_JSON=ON \
-DARROW_WITH_UTF8PROC=OFF -DARROW_WITH_RE2=ON -DARROW_ORC=ON \
+ -DARROW_COMPUTE=ON \
+ -DARROW_FILESYSTEM=ON \
+ -DARROW_DATASET=ON \
+ -DARROW_ACERO=ON \
-DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
-DCMAKE_INSTALL_LIBDIR=lib64 \
-DARROW_BOOST_USE_SHARED=OFF \
@@ -1137,6 +1141,8 @@ build_arrow() {
cp -rf ./brotli_ep/src/brotli_ep-install/lib/libbrotlicommon-static.a
"${TP_INSTALL_DIR}/lib64/libbrotlicommon.a"
strip_lib libarrow.a
strip_lib libparquet.a
+ strip_lib libarrow_dataset.a
+ strip_lib libarrow_acero.a
}
# abseil
@@ -1804,7 +1810,7 @@ build_libdeflate() {
cd "${BUILD_DIR}"
"${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
- -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DCMAKE_BUILD_TYPE=Release ..
+ -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DCMAKE_BUILD_TYPE=Release
..
"${BUILD_SYSTEM}" -j "${PARALLEL}"
"${BUILD_SYSTEM}" install
}
@@ -1979,6 +1985,89 @@ build_pugixml() {
cp "${TP_SOURCE_DIR}/${PUGIXML_SOURCE}/src/pugiconfig.hpp"
"${TP_INSTALL_DIR}/include/"
}
+# paimon-cpp
+# Configures, builds and installs paimon-cpp as a static library, then copies
+# the bundled dependency archives that its build produces but does not install.
+build_paimon_cpp() {
+    check_if_source_exist "${PAIMON_CPP_SOURCE}"
+    cd "${TP_SOURCE_DIR}/${PAIMON_CPP_SOURCE}"
+
+    # Recreate the build tree from scratch on every invocation.
+    rm -rf "${BUILD_DIR}"
+    mkdir -p "${BUILD_DIR}"
+    cd "${BUILD_DIR}"
+
+    # Extra libraries for the link line. Darwin doesn't build GNU libunwind in
+    # this script, so -lunwind is only appended on other kernels.
+    local link_flags="-L${TP_LIB_DIR} -lbrotlienc -lbrotlidec -lbrotlicommon -llzma"
+    if [[ "${KERNEL}" != 'Darwin' ]]; then
+        link_flags+=" -lunwind"
+    fi
+
+    # The initial-cache script (-C) must come first so the -D options below
+    # are applied on top of it.
+    local -a cmake_args=(
+        -C "${TP_DIR}/paimon-cpp-cache.cmake"
+        -G "${GENERATOR}"
+        -DCMAKE_POLICY_VERSION_MINIMUM=3.5
+        -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}"
+        -DPAIMON_BUILD_SHARED=OFF
+        -DPAIMON_BUILD_STATIC=ON
+        -DPAIMON_BUILD_TESTS=OFF
+        -DPAIMON_ENABLE_ORC=ON
+        -DPAIMON_ENABLE_AVRO=OFF
+        -DPAIMON_ENABLE_LANCE=OFF
+        -DPAIMON_ENABLE_JINDO=OFF
+        -DPAIMON_ENABLE_LUMINA=OFF
+        -DPAIMON_ENABLE_LUCENE=OFF
+        -DCMAKE_EXE_LINKER_FLAGS="${link_flags}"
+        -DCMAKE_SHARED_LINKER_FLAGS="${link_flags}"
+    )
+    CXXFLAGS="-Wno-nontrivial-memcall" "${CMAKE_CMD}" "${cmake_args[@]}" ..
+    "${BUILD_SYSTEM}" -j "${PARALLEL}"
+    "${BUILD_SYSTEM}" install
+
+    # paimon-cpp builds several internal dependencies without installing them;
+    # copy them out under names that cannot collide with Doris's own libraries.
+    echo "Installing paimon-cpp internal dependencies..."
+
+    # Arrow: with PAIMON_USE_EXTERNAL_ARROW=ON (Plan B) paimon-cpp reuses Doris's
+    # Arrow and never builds arrow_ep, so there is nothing to copy. In the legacy
+    # mode, where it builds its own Arrow, the archives go into an isolated
+    # directory so they do not clash with the Doris copies.
+    local paimon_deps_dir="${TP_INSTALL_DIR}/paimon-cpp/lib64/paimon_deps"
+    local arrow_archive
+    if [[ ! -d "arrow_ep-install/lib" ]]; then
+        echo " arrow_ep-install not found (PAIMON_USE_EXTERNAL_ARROW=ON?) – skipping paimon_deps Arrow copy"
+    else
+        mkdir -p "${paimon_deps_dir}"
+        for arrow_archive in libarrow.a libarrow_filesystem.a libarrow_dataset.a libarrow_acero.a libparquet.a; do
+            if [[ -f "arrow_ep-install/lib/${arrow_archive}" ]]; then
+                cp -v "arrow_ep-install/lib/${arrow_archive}" "${paimon_deps_dir}/${arrow_archive}"
+            fi
+        done
+    fi
+
+    # "<path inside the build tree>:<installed file name>" pairs. Each archive is
+    # renamed with a _paimon suffix to avoid conflicts with Doris's croaringbitmap,
+    # xxhash, fmt (v7 in Doris, v11 here) and tbb. Missing archives are skipped.
+    local dep_spec
+    local dep_src
+    local dep_dst
+    for dep_spec in \
+        "release/libroaring_bitmap.a:libroaring_bitmap_paimon.a" \
+        "release/libxxhash.a:libxxhash_paimon.a" \
+        "fmt_ep-install/lib/libfmt.a:libfmt_paimon.a" \
+        "tbb_ep-install/lib/libtbb.a:libtbb_paimon.a"; do
+        dep_src="${dep_spec%%:*}"
+        dep_dst="${TP_INSTALL_DIR}/lib64/${dep_spec##*:}"
+        if [[ -f "${dep_src}" ]]; then
+            cp -v "${dep_src}" "${dep_dst}"
+        fi
+    done
+
+    echo "Paimon-cpp internal dependencies installed successfully"
+}
+
if [[ "${#packages[@]}" -eq 0 ]]; then
packages=(
jindofs
@@ -2051,6 +2140,7 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
brotli
icu
pugixml
+ paimon_cpp
)
if [[ "$(uname -s)" == 'Darwin' ]]; then
read -r -a packages <<<"binutils gettext ${packages[*]}"
@@ -2060,6 +2150,122 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
fi
fi
+# Remove one extracted source tree to free disk space.
+# $1: directory name relative to TP_SOURCE_DIR.
+# An empty name, an empty TP_SOURCE_DIR or a directory that does not exist is a
+# silent no-op, so this can never expand to an "rm -rf /<something>".
+remove_package_source_dir() {
+    local rel_dir="$1"
+    if [[ -z "${TP_SOURCE_DIR}" || -z "${rel_dir}" ]]; then
+        return 0
+    fi
+    if [[ -d "${TP_SOURCE_DIR}/${rel_dir}" ]]; then
+        echo "Cleaning up source: ${rel_dir}"
+        rm -rf "${TP_SOURCE_DIR:?}/${rel_dir}"
+    fi
+}
+
+# Map a package name to its source directory variable(s) and remove them to free disk space.
+# This is called after each package is built and installed successfully.
+# $1: package name as used in the packages list (the suffix of build_<package>).
+# Packages without a mapping only produce a warning; nothing is removed for them.
+cleanup_package_source() {
+    local pkg="$1"
+    local src_var
+    local dep_var
+
+    # Map package name to the uppercase *_SOURCE variable name
+    case "${pkg}" in
+    libevent) src_var="LIBEVENT_SOURCE" ;;
+    openssl) src_var="OPENSSL_SOURCE" ;;
+    thrift) src_var="THRIFT_SOURCE" ;;
+    protobuf) src_var="PROTOBUF_SOURCE" ;;
+    gflags) src_var="GFLAGS_SOURCE" ;;
+    glog) src_var="GLOG_SOURCE" ;;
+    gtest) src_var="GTEST_SOURCE" ;;
+    rapidjson) src_var="RAPIDJSON_SOURCE" ;;
+    snappy) src_var="SNAPPY_SOURCE" ;;
+    gperftools) src_var="GPERFTOOLS_SOURCE" ;;
+    zlib) src_var="ZLIB_SOURCE" ;;
+    crc32c) src_var="CRC32C_SOURCE" ;;
+    lz4) src_var="LZ4_SOURCE" ;;
+    bzip) src_var="BZIP_SOURCE" ;;
+    lzo2) src_var="LZO2_SOURCE" ;;
+    zstd) src_var="ZSTD_SOURCE" ;;
+    # boost is deliberately not mapped (BOOST_SOURCE): boost is used for mysql later
+    abseil) src_var="ABSEIL_SOURCE" ;;
+    curl) src_var="CURL_SOURCE" ;;
+    re2) src_var="RE2_SOURCE" ;;
+    hyperscan)
+        # hyperscan also builds ragel, clean both
+        remove_package_source_dir "${RAGEL_SOURCE}"
+        src_var="HYPERSCAN_SOURCE"
+        ;;
+    mysql) src_var="MYSQL_SOURCE" ;;
+    odbc) src_var="ODBC_SOURCE" ;;
+    leveldb) src_var="LEVELDB_SOURCE" ;;
+    brpc) src_var="BRPC_SOURCE" ;;
+    rocksdb) src_var="ROCKSDB_SOURCE" ;;
+    cyrus_sasl) src_var="CYRUS_SASL_SOURCE" ;;
+    librdkafka) src_var="LIBRDKAFKA_SOURCE" ;;
+    flatbuffers) src_var="FLATBUFFERS_SOURCE" ;;
+    arrow) src_var="ARROW_SOURCE" ;;
+    brotli) src_var="BROTLI_SOURCE" ;;
+    cares) src_var="CARES_SOURCE" ;;
+    grpc) src_var="GRPC_SOURCE" ;;
+    s2) src_var="S2_SOURCE" ;;
+    bitshuffle) src_var="BITSHUFFLE_SOURCE" ;;
+    croaringbitmap) src_var="CROARINGBITMAP_SOURCE" ;;
+    fmt) src_var="FMT_SOURCE" ;;
+    parallel_hashmap) src_var="PARALLEL_HASHMAP_SOURCE" ;;
+    orc) src_var="ORC_SOURCE" ;;
+    cctz) src_var="CCTZ_SOURCE" ;;
+    jemalloc_doris) src_var="JEMALLOC_DORIS_SOURCE" ;;
+    libunwind) src_var="LIBUNWIND_SOURCE" ;;
+    benchmark) src_var="BENCHMARK_SOURCE" ;;
+    simdjson) src_var="SIMDJSON_SOURCE" ;;
+    nlohmann_json) src_var="NLOHMANN_JSON_SOURCE" ;;
+    libbacktrace) src_var="LIBBACKTRACE_SOURCE" ;;
+    sse2neon) src_var="SSE2NEON_SOURCE" ;;
+    xxhash) src_var="XXHASH_SOURCE" ;;
+    concurrentqueue) src_var="CONCURRENTQUEUE_SOURCE" ;;
+    fast_float) src_var="FAST_FLOAT_SOURCE" ;;
+    hadoop_libs) src_var="HADOOP_LIBS_SOURCE" ;;
+    hadoop_libs_3_4) src_var="HADOOP_LIBS_3_4_SOURCE" ;;
+    avx2neon) src_var="AVX2NEON_SOURCE" ;;
+    libdeflate) src_var="LIBDEFLATE_SOURCE" ;;
+    streamvbyte) src_var="STREAMVBYTE_SOURCE" ;;
+    ali_sdk)
+        # ali_sdk internally builds jsoncpp and libuuid, clean all three
+        for dep_var in JSONCPP_SOURCE LIBUUID_SOURCE ALI_SDK_SOURCE; do
+            remove_package_source_dir "${!dep_var}"
+        done
+        return 0
+        ;;
+    base64) src_var="BASE64_SOURCE" ;;
+    azure) src_var="AZURE_SOURCE" ;;
+    dragonbox) src_var="DRAGONBOX_SOURCE" ;;
+    icu) src_var="ICU_SOURCE" ;;
+    jindofs) src_var="JINDOFS_SOURCE" ;;
+    pugixml) src_var="PUGIXML_SOURCE" ;;
+    paimon_cpp) src_var="PAIMON_CPP_SOURCE" ;;
+    aws_sdk) src_var="AWS_SDK_SOURCE" ;;
+    lzma) src_var="LZMA_SOURCE" ;;
+    xml2) src_var="XML2_SOURCE" ;;
+    idn) src_var="IDN_SOURCE" ;;
+    gsasl) src_var="GSASL_SOURCE" ;;
+    krb5) src_var="KRB5_SOURCE" ;;
+    hdfs3) src_var="HDFS3_SOURCE" ;;
+    libdivide) src_var="LIBDIVIDE_SOURCE" ;;
+    binutils) src_var="BINUTILS_SOURCE" ;;
+    gettext) src_var="GETTEXT_SOURCE" ;;
+    # Header-only files, skip cleanup
+    pdqsort | timsort | tsan_header | js_and_css)
+        return 0
+        ;;
+    *)
+        echo "Warning: no source mapping for package '${pkg}', skipping cleanup"
+        return 0
+        ;;
+    esac
+
+    # Indirect expansion: src_var holds the NAME of the variable whose value is
+    # the directory to delete.
+    remove_package_source_dir "${!src_var}"
+}
+
for package in "${packages[@]}"; do
if [[ "${package}" == "${start_package}" ]]; then
PACKAGE_FOUND=1
@@ -2067,6 +2273,11 @@ for package in "${packages[@]}"; do
if [[ "${CONTINUE}" -eq 0 ]] || [[ "${PACKAGE_FOUND}" -eq 1 ]]; then
command="build_${package}"
${command}
+ cd "${TP_DIR}"
+ cleanup_package_source "${package}"
+ echo "debug after clean: ${package}"
+ df -h
+ du -sh "${TP_DIR}"
fi
done
diff --git a/thirdparty/download-thirdparty.sh
b/thirdparty/download-thirdparty.sh
index f3b999115b7..d85b86704d3 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -103,6 +103,18 @@ md5sum_func() {
return 0
}
+# Succeeds (exit status 0) when the archive prefix given as $1 has a non-empty
+# <PREFIX>_GIT_URL variable, i.e. the package is fetched with git.
+is_git_package() {
+    local url_var_name="${1}_GIT_URL"
+    # Indirect expansion: test the value of the variable named by url_var_name.
+    [[ -n "${!url_var_name}" ]]
+}
+
+# Prints the value of <PREFIX>_GIT_URL for the archive prefix given as $1.
+git_url_for() {
+    local url_var_name="${1}_GIT_URL"
+    # Indirect expansion: print the value of the variable named by url_var_name.
+    echo "${!url_var_name}"
+}
+
# return 0 if download succeed.
# return 1 if not.
download_func() {
@@ -159,6 +171,10 @@ download_func() {
# download thirdparty archives
echo "===== Downloading thirdparty archives..."
for TP_ARCH in "${TP_ARCHIVES[@]}"; do
+ if is_git_package "${TP_ARCH}"; then
+ echo "Skip downloading ${TP_ARCH} (git repo: $(git_url_for
"${TP_ARCH}"))"
+ continue
+ fi
NAME="${TP_ARCH}_NAME"
MD5SUM="${TP_ARCH}_MD5SUM"
if [[ -z "${REPOSITORY_URL}" ]]; then
@@ -184,6 +200,9 @@ echo "===== Downloading thirdparty archives...done"
# check if all tp archives exists
echo "===== Checking all thirdpart archives..."
for TP_ARCH in "${TP_ARCHIVES[@]}"; do
+ if is_git_package "${TP_ARCH}"; then
+ continue
+ fi
NAME="${TP_ARCH}_NAME"
if [[ ! -r "${TP_SOURCE_DIR}/${!NAME}" ]]; then
echo "Failed to fetch ${!NAME}"
@@ -201,6 +220,9 @@ SUFFIX_XZ="\.tar\.xz$"
SUFFIX_ZIP="\.zip$"
SUFFIX_BZ2="\.tar\.bz2$"
for TP_ARCH in "${TP_ARCHIVES[@]}"; do
+ if is_git_package "${TP_ARCH}"; then
+ continue
+ fi
NAME="${TP_ARCH}_NAME"
SOURCE="${TP_ARCH}_SOURCE"
@@ -240,6 +262,57 @@ for TP_ARCH in "${TP_ARCHIVES[@]}"; do
done
echo "===== Unpacking all thirdparty archives...done"
+# Clone and checkout git repositories
+# Archives that define <PREFIX>_GIT_URL are skipped by the download/unpack
+# loops; here each of them is cloned into TP_SOURCE_DIR (or refreshed when the
+# directory already exists) and checked out at <PREFIX>_GIT_TAG.
+# A clone or checkout failure aborts the script; an incomplete configuration
+# only prints a warning and moves on to the next archive.
+echo "===== Cloning git repositories..."
+for TP_ARCH in "${TP_ARCHIVES[@]}"; do
+    if ! is_git_package "${TP_ARCH}"; then
+        continue
+    fi
+
+    GIT_URL_VAR="${TP_ARCH}_GIT_URL"
+    GIT_TAG_VAR="${TP_ARCH}_GIT_TAG"
+    SOURCE_VAR="${TP_ARCH}_SOURCE"
+
+    GIT_URL="${!GIT_URL_VAR}"
+    GIT_TAG="${!GIT_TAG_VAR}"
+    SOURCE_DIR="${TP_SOURCE_DIR}/${!SOURCE_VAR}"
+
+    if [[ -z "${GIT_URL}" ]] || [[ -z "${GIT_TAG}" ]] || [[ -z "${!SOURCE_VAR}" ]]; then
+        echo "Warning: ${TP_ARCH} git configuration incomplete, skipping"
+        continue
+    fi
+
+    if [[ ! -d "${SOURCE_DIR}" ]]; then
+        echo "Cloning ${TP_ARCH} from ${GIT_URL}..."
+        cd "${TP_SOURCE_DIR}"
+        if ! git clone "${GIT_URL}" "${!SOURCE_VAR}"; then
+            echo "Failed to clone ${TP_ARCH}"
+            exit 1
+        fi
+    else
+        echo "${TP_ARCH} repository already exists, updating..."
+        cd "${SOURCE_DIR}"
+        # Best effort: an offline refresh must not abort the run, the checkout
+        # below decides whether the wanted revision is available.
+        # --tags: also fetch tags that are not reachable from a fetched branch.
+        git fetch --tags origin || true
+    fi
+
+    cd "${SOURCE_DIR}"
+    # First attempt is silent; the retry below reports errors.
+    if ! git checkout "${GIT_TAG}" 2>/dev/null; then
+        echo "Tag ${GIT_TAG} not found, trying to fetch..."
+        is_shallow="$(git rev-parse --is-shallow-repository 2>/dev/null || echo false)"
+        if [[ "${is_shallow}" == "true" ]]; then
+            # Deepen a shallow clone to full history; the --depth form is the
+            # fallback when --unshallow fails.
+            git fetch --unshallow --tags origin || git fetch --depth=2147483647 --tags origin
+        else
+            git fetch --tags origin
+        fi
+        if ! git checkout "${GIT_TAG}"; then
+            echo "Failed to checkout ${GIT_TAG} for ${TP_ARCH}"
+            exit 1
+        fi
+    fi
+    echo "Successfully checked out ${GIT_TAG} for ${TP_ARCH}"
+done
+echo "===== Cloning git repositories...done"
+
echo "===== Patching thirdparty archives..."
###################################################################################
@@ -352,6 +425,16 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
fi
cd -
fi
+    # Paimon-cpp parquet patches: row-group-aware batch reader, max_row_group_size,
+    # GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty fix.
+    #
+    # This step uses its own marker file instead of ${PATCHED_MARK}: that marker
+    # is per source tree, so once any earlier patch step in this tree has created
+    # it, a check against it would skip this patch on a fresh checkout.
+    # NOTE(review): the Arrow block directly above appears to create
+    # ${PATCHED_MARK} for this same directory - confirm.
+    if [[ "${ARROW_SOURCE}" == "arrow-apache-arrow-17.0.0" ]]; then
+        cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}"
+        if [[ ! -f "${PATCHED_MARK}.paimon" ]]; then
+            patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch"
+            touch "${PATCHED_MARK}.paimon"
+        fi
+        cd -
+    fi
echo "Finished patching ${ARROW_SOURCE}"
fi
@@ -612,6 +695,21 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " AZURE " ]]; then
echo "Finished patching ${AZURE_SOURCE}"
fi
+# patch paimon-cpp
+# Best-effort: the patch is tried with --dry-run first and only applied when
+# that succeeds. Otherwise a notice is printed and the tree is left untouched.
+# Either way the tree is marked as patched so the attempt is not repeated.
+case " ${TP_ARCHIVES[*]} " in
+*" PAIMON_CPP "*)
+    cd "${TP_SOURCE_DIR}/${PAIMON_CPP_SOURCE}"
+    if [[ ! -f "${PATCHED_MARK}" ]]; then
+        if ! patch -p1 -N --batch --dry-run <"${TP_PATCH_DIR}/paimon-cpp-buildutils-static-deps.patch" >/dev/null 2>&1; then
+            echo "Skip paimon-cpp patch: already applied or not applicable for current source"
+        else
+            patch -p1 -N --batch <"${TP_PATCH_DIR}/paimon-cpp-buildutils-static-deps.patch"
+        fi
+        touch "${PATCHED_MARK}"
+    fi
+    cd -
+    echo "Finished patching ${PAIMON_CPP_SOURCE}"
+    ;;
+esac
+
if [[ " ${TP_ARCHIVES[*]} " =~ " CCTZ " ]] ; then
cd $TP_SOURCE_DIR/$CCTZ_SOURCE
if [[ ! -f "$PATCHED_MARK" ]] ; then
diff --git a/thirdparty/paimon-cpp-cache.cmake
b/thirdparty/paimon-cpp-cache.cmake
new file mode 100644
index 00000000000..dbebd94a0cc
--- /dev/null
+++ b/thirdparty/paimon-cpp-cache.cmake
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# CMake Initial Cache for paimon-cpp
+# Configures paimon-cpp to reuse selected Doris third-party libraries
+# Usage: cmake -C paimon-cpp-cache.cmake ...
+
+# Get the Doris thirdparty installation directory from environment
+set(DORIS_THIRDPARTY_DIR "$ENV{TP_INSTALL_DIR}" CACHE PATH "Doris thirdparty
install directory")
+
+if(NOT DORIS_THIRDPARTY_DIR)
+ message(FATAL_ERROR "TP_INSTALL_DIR environment variable must be set")
+endif()
+
+message(STATUS "Using Doris thirdparty libraries from:
${DORIS_THIRDPARTY_DIR}")
+
+# Set CMAKE_PREFIX_PATH to help find_package locate our libraries
+set(CMAKE_PREFIX_PATH "${DORIS_THIRDPARTY_DIR};${CMAKE_PREFIX_PATH}" CACHE
STRING "Search path for find_package")
+
+# Library and include paths
+set(DORIS_LIB_DIR "${DORIS_THIRDPARTY_DIR}/lib" CACHE PATH "Doris library
directory")
+set(DORIS_INCLUDE_DIR "${DORIS_THIRDPARTY_DIR}/include" CACHE PATH "Doris
include directory")
+
+# ============================================================================
+# ZLIB - Reuse from Doris (version 1.3.1)
+# ============================================================================
+set(ZLIB_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "ZLIB root directory")
+set(ZLIB_LIBRARY "${DORIS_LIB_DIR}/libz.a" CACHE FILEPATH "ZLIB library")
+set(ZLIB_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "ZLIB include
directory")
+
+# ============================================================================
+# ZSTD - Reuse from Doris (version 1.5.7)
+# ============================================================================
+set(ZSTD_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "ZSTD root directory")
+set(ZSTD_LIBRARY "${DORIS_LIB_DIR}/libzstd.a" CACHE FILEPATH "ZSTD library")
+set(ZSTD_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "ZSTD include
directory")
+
+# ============================================================================
+# LZ4 - Reuse from Doris (version 1.9.4)
+# ============================================================================
+set(LZ4_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "LZ4 root directory")
+set(LZ4_LIBRARY "${DORIS_LIB_DIR}/liblz4.a" CACHE FILEPATH "LZ4 library")
+set(LZ4_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "LZ4 include directory")
+
+# ============================================================================
+# glog - NOT reused from Doris
+# paimon-cpp's build_glog() unconditionally calls externalproject_add() to
+# build glog 0.7.1. Any GLOG_ROOT/GLOG_LIBRARY/GLOG_INCLUDE_DIR set here
+# would be overwritten by that macro, so we skip them entirely.
+# ============================================================================
+
+# ============================================================================
+# Arrow - Reuse from Doris (Doris Arrow now includes
COMPUTE/DATASET/ACERO/FILESYSTEM)
+# Doris's Arrow 17.0.0 is built with the full module set that paimon-cpp
+# needs, so we skip paimon-cpp's internal externalproject_add(arrow_ep ...).
+# ============================================================================
+set(PAIMON_USE_EXTERNAL_ARROW ON CACHE BOOL "Use pre-built Arrow from Doris
instead of building from source")
+
+set(DORIS_LIB64_DIR "${DORIS_THIRDPARTY_DIR}/lib64" CACHE PATH "Doris lib64
directory")
+
+set(PAIMON_EXTERNAL_ARROW_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "Arrow
include directory")
+set(PAIMON_EXTERNAL_ARROW_LIB "${DORIS_LIB64_DIR}/libarrow.a" CACHE FILEPATH
"Arrow core library")
+set(PAIMON_EXTERNAL_ARROW_DATASET_LIB "${DORIS_LIB64_DIR}/libarrow_dataset.a"
CACHE FILEPATH "Arrow Dataset library")
+set(PAIMON_EXTERNAL_ARROW_ACERO_LIB "${DORIS_LIB64_DIR}/libarrow_acero.a"
CACHE FILEPATH "Arrow Acero library")
+set(PAIMON_EXTERNAL_PARQUET_LIB "${DORIS_LIB64_DIR}/libparquet.a" CACHE
FILEPATH "Parquet library")
+set(PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB
"${DORIS_LIB64_DIR}/libarrow_bundled_dependencies.a" CACHE FILEPATH "Arrow
bundled dependencies library")
+
+# Protobuf, Thrift - still built separately by paimon-cpp
+
+# ============================================================================
+# Snappy - Reuse from Doris
+# ============================================================================
+set(Snappy_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "Snappy root directory")
+set(SNAPPY_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "Snappy root directory
(legacy)")
+set(SNAPPY_LIBRARY "${DORIS_LIB_DIR}/libsnappy.a" CACHE FILEPATH "Snappy
library")
+set(SNAPPY_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "Snappy include
directory")
+
+# ============================================================================
+# Build configuration
+# ============================================================================
+set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "Build with -fPIC")
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type")
+
+# Symbol visibility control to prevent conflicts with Doris
+# paimon-cpp builds Arrow/ORC/etc with hidden symbols to avoid conflicts
+set(CMAKE_CXX_VISIBILITY_PRESET "hidden" CACHE STRING "Hide C++ symbols by
default")
+set(CMAKE_C_VISIBILITY_PRESET "hidden" CACHE STRING "Hide C symbols by
default")
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON CACHE BOOL "Hide inline function
symbols")
+
+# Verify that required libraries exist
+# Checked here so that a missing prerequisite stops the configure step with
+# the offending path in the message.
+if(NOT EXISTS "${ZLIB_LIBRARY}")
+  message(FATAL_ERROR "ZLIB library not found: ${ZLIB_LIBRARY}")
+endif()
+if(NOT EXISTS "${ZSTD_LIBRARY}")
+  message(FATAL_ERROR "ZSTD library not found: ${ZSTD_LIBRARY}")
+endif()
+if(NOT EXISTS "${LZ4_LIBRARY}")
+  message(FATAL_ERROR "LZ4 library not found: ${LZ4_LIBRARY}")
+endif()
+if(NOT EXISTS "${SNAPPY_LIBRARY}")
+  message(FATAL_ERROR "Snappy library not found: ${SNAPPY_LIBRARY}")
+endif()
+
+# When Arrow is reused from Doris, the archives this file points paimon-cpp at
+# must already be installed (for example, an Arrow built without the
+# DATASET/ACERO modules does not provide all of them).
+# NOTE(review): PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB is deliberately not
+# checked - it is not visible here whether the Doris Arrow build installs
+# libarrow_bundled_dependencies.a; confirm before making it mandatory.
+if(PAIMON_USE_EXTERNAL_ARROW)
+  foreach(_paimon_arrow_lib_var IN ITEMS
+          PAIMON_EXTERNAL_ARROW_LIB
+          PAIMON_EXTERNAL_ARROW_DATASET_LIB
+          PAIMON_EXTERNAL_ARROW_ACERO_LIB
+          PAIMON_EXTERNAL_PARQUET_LIB)
+    if(NOT EXISTS "${${_paimon_arrow_lib_var}}")
+      message(FATAL_ERROR "${_paimon_arrow_lib_var} not found: ${${_paimon_arrow_lib_var}}")
+    endif()
+  endforeach()
+  unset(_paimon_arrow_lib_var)
+endif()
+
+message(STATUS "========================================")
+message(STATUS "Paimon-cpp Library Reuse Configuration")
+message(STATUS "========================================")
+message(STATUS "Reusing from Doris:")
+message(STATUS " ✓ ZLIB, ZSTD, LZ4, Snappy")
+if(PAIMON_USE_EXTERNAL_ARROW)
+ message(STATUS " ✓ Arrow, Parquet, Arrow Dataset, Arrow Acero (Plan B)")
+else()
+ message(STATUS " ✗ Arrow (building separately, symbol visibility=hidden)")
+endif()
+message(STATUS "")
+message(STATUS "Building separately:")
+if(NOT PAIMON_USE_EXTERNAL_ARROW)
+ message(STATUS " - Arrow, Protobuf, Thrift, ORC")
+else()
+ message(STATUS " - Protobuf, Thrift, ORC")
+endif()
+message(STATUS " - glog, RapidJSON, TBB")
+message(STATUS "========================================")
diff --git a/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
b/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
new file mode 100644
index 00000000000..4e53117b79b
--- /dev/null
+++ b/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
@@ -0,0 +1,224 @@
+diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
+index ec3890a41f..943f69bb6c 100644
+--- a/cpp/src/parquet/arrow/schema.cc
++++ b/cpp/src/parquet/arrow/schema.cc
+@@ -178,7 +178,7 @@ static Status GetTimestampMetadata(const
::arrow::TimestampType& type,
+
+ // The user is explicitly asking for Impala int96 encoding, there is no
+ // logical type.
+- if (arrow_properties.support_deprecated_int96_timestamps()) {
++ if (arrow_properties.support_deprecated_int96_timestamps() && target_unit
== ::arrow::TimeUnit::NANO) {
+ *physical_type = ParquetType::INT96;
+ return Status::OK();
+ }
+
+diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
+index 285e2a5973..aa6f92f077 100644
+--- a/cpp/src/parquet/arrow/reader.cc
++++ b/cpp/src/parquet/arrow/reader.cc
+@@ -1013,25 +1013,32 @@ Status FileReaderImpl::GetRecordBatchReader(const
std::vector<int>& row_groups,
+ return Status::OK();
+ }
+
+- int64_t num_rows = 0;
++ std::vector<int64_t> num_rows;
+ for (int row_group : row_groups) {
+- num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
++
num_rows.push_back(parquet_reader()->metadata()->RowGroup(row_group)->num_rows());
+ }
+
+ using ::arrow::RecordBatchIterator;
++ int row_group_idx = 0;
+
+ // NB: This lambda will be invoked outside the scope of this call to
+ // `GetRecordBatchReader()`, so it must capture `readers` and
`batch_schema` by value.
+ // `this` is a non-owning pointer so we are relying on the parent
FileReader outliving
+ // this RecordBatchReader.
+ ::arrow::Iterator<RecordBatchIterator> batches =
::arrow::MakeFunctionIterator(
+- [readers, batch_schema, num_rows,
++ [readers, batch_schema, num_rows, row_group_idx,
+ this]() mutable -> ::arrow::Result<RecordBatchIterator> {
+ ::arrow::ChunkedArrayVector columns(readers.size());
+
+- // don't reserve more rows than necessary
+- int64_t batch_size = std::min(properties().batch_size(), num_rows);
+- num_rows -= batch_size;
++ int64_t batch_size = 0;
++ if (!num_rows.empty()) {
++ // don't reserve more rows than necessary
++ batch_size = std::min(properties().batch_size(),
num_rows[row_group_idx]);
++ num_rows[row_group_idx] -= batch_size;
++ if (num_rows[row_group_idx] == 0 && (num_rows.size() - 1) !=
row_group_idx) {
++ row_group_idx++;
++ }
++ }
+
+ RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
+ reader_properties_.use_threads(),
static_cast<int>(readers.size()),
+diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
+index 4fd7ef1b47..87326a54f1 100644
+--- a/cpp/src/parquet/arrow/writer.cc
++++ b/cpp/src/parquet/arrow/writer.cc
+@@ -314,6 +314,14 @@ class FileWriterImpl : public FileWriter {
+ return Status::OK();
+ }
+
++ int64_t GetBufferedSize() override {
++ if (row_group_writer_ == nullptr) {
++ return 0;
++ }
++ return row_group_writer_->total_compressed_bytes() +
++ row_group_writer_->total_compressed_bytes_written();
++ }
++
+ Status Close() override {
+ if (!closed_) {
+ // Make idempotent
+@@ -418,10 +426,13 @@ class FileWriterImpl : public FileWriter {
+
+ // Max number of rows allowed in a row group.
+ const int64_t max_row_group_length =
this->properties().max_row_group_length();
++ const int64_t max_row_group_size =
this->properties().max_row_group_size();
+
+ // Initialize a new buffered row group writer if necessary.
+ if (row_group_writer_ == nullptr || !row_group_writer_->buffered() ||
+- row_group_writer_->num_rows() >= max_row_group_length) {
++ row_group_writer_->num_rows() >= max_row_group_length ||
++ (row_group_writer_->total_compressed_bytes_written() +
++ row_group_writer_->total_compressed_bytes() >= max_row_group_size)) {
+ RETURN_NOT_OK(NewBufferedRowGroup());
+ }
+
+diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h
+index 4a1a033a7b..0f13d05e44 100644
+--- a/cpp/src/parquet/arrow/writer.h
++++ b/cpp/src/parquet/arrow/writer.h
+@@ -138,6 +138,9 @@ class PARQUET_EXPORT FileWriter {
+ /// option in this case.
+ virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch)
= 0;
+
++ /// \brief Return the buffered size in bytes.
++ virtual int64_t GetBufferedSize() = 0;
++
+ /// \brief Write the footer and close the file.
+ virtual ::arrow::Status Close() = 0;
+ virtual ~FileWriter();
+diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
+index 4d3acb491e..3906ff3c59 100644
+--- a/cpp/src/parquet/properties.h
++++ b/cpp/src/parquet/properties.h
+@@ -139,6 +139,7 @@ static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+ static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT =
kDefaultDataPageSize;
+ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
+ static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
++static constexpr int64_t DEFAULT_MAX_ROW_GROUP_SIZE = 128 * 1024 * 1024;
+ static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
+ static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
+ static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
+@@ -232,6 +233,7 @@ class PARQUET_EXPORT WriterProperties {
+ dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
+ write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
+ max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
++ max_row_group_size_(DEFAULT_MAX_ROW_GROUP_SIZE),
+ pagesize_(kDefaultDataPageSize),
+ version_(ParquetVersion::PARQUET_2_6),
+ data_page_version_(ParquetDataPageVersion::V1),
+@@ -244,6 +246,7 @@ class PARQUET_EXPORT WriterProperties {
+ dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
+ write_batch_size_(properties.write_batch_size()),
+ max_row_group_length_(properties.max_row_group_length()),
++ max_row_group_size_(properties.max_row_group_size()),
+ pagesize_(properties.data_pagesize()),
+ version_(properties.version()),
+ data_page_version_(properties.data_page_version()),
+@@ -321,6 +324,13 @@ class PARQUET_EXPORT WriterProperties {
+ return this;
+ }
+
++ /// Specify the max bytes size to put in a single row group.
++ /// Default 128 M.
++ Builder* max_row_group_size(int64_t max_row_group_size) {
++ max_row_group_size_ = max_row_group_size;
++ return this;
++ }
++
+ /// Specify the data page size.
+ /// Default 1MB.
+ Builder* data_pagesize(int64_t pg_size) {
+@@ -664,7 +674,7 @@ class PARQUET_EXPORT WriterProperties {
+
+ return std::shared_ptr<WriterProperties>(new WriterProperties(
+ pool_, dictionary_pagesize_limit_, write_batch_size_,
max_row_group_length_,
+- pagesize_, version_, created_by_, page_checksum_enabled_,
++ max_row_group_size_, pagesize_, version_, created_by_,
page_checksum_enabled_,
+ std::move(file_encryption_properties_), default_column_properties_,
+ column_properties, data_page_version_, store_decimal_as_integer_,
+ std::move(sorting_columns_)));
+@@ -675,6 +685,7 @@ class PARQUET_EXPORT WriterProperties {
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
++ int64_t max_row_group_size_;
+ int64_t pagesize_;
+ ParquetVersion::type version_;
+ ParquetDataPageVersion data_page_version_;
+@@ -705,6 +716,8 @@ class PARQUET_EXPORT WriterProperties {
+
+ inline int64_t max_row_group_length() const { return max_row_group_length_;
}
+
++ inline int64_t max_row_group_size() const { return max_row_group_size_; }
++
+ inline int64_t data_pagesize() const { return pagesize_; }
+
+ inline ParquetDataPageVersion data_page_version() const {
+@@ -810,7 +823,7 @@ class PARQUET_EXPORT WriterProperties {
+ private:
+ explicit WriterProperties(
+ MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t
write_batch_size,
+- int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type
version,
++ int64_t max_row_group_length, int64_t max_row_group_size, int64_t
pagesize, ParquetVersion::type version,
+ const std::string& created_by, bool page_write_checksum_enabled,
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
+ const ColumnProperties& default_column_properties,
+@@ -821,6 +834,7 @@ class PARQUET_EXPORT WriterProperties {
+ dictionary_pagesize_limit_(dictionary_pagesize_limit),
+ write_batch_size_(write_batch_size),
+ max_row_group_length_(max_row_group_length),
++ max_row_group_size_(max_row_group_size),
+ pagesize_(pagesize),
+ parquet_data_page_version_(data_page_version),
+ parquet_version_(version),
+@@ -836,6 +850,7 @@ class PARQUET_EXPORT WriterProperties {
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
++ int64_t max_row_group_size_;
+ int64_t pagesize_;
+ ParquetDataPageVersion parquet_data_page_version_;
+ ParquetVersion::type parquet_version_;
+diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
+index 9df922afa2..5c8b3d4d07 100644
+--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1789,7 +1789,20 @@ if(ARROW_WITH_THRIFT)
+ REQUIRED_VERSION
+ 0.11.0)
+
+- string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
++ if(NOT Thrift_VERSION)
++ if(DEFINED thrift_PC_VERSION AND thrift_PC_VERSION)
++ set(Thrift_VERSION "${thrift_PC_VERSION}")
++ elseif(DEFINED ThriftAlt_VERSION AND ThriftAlt_VERSION)
++ set(Thrift_VERSION "${ThriftAlt_VERSION}")
++ elseif(DEFINED THRIFT_VERSION AND THRIFT_VERSION)
++ set(Thrift_VERSION "${THRIFT_VERSION}")
++ endif()
++ endif()
++ if(NOT Thrift_VERSION)
++ message(FATAL_ERROR "Thrift_VERSION is empty after resolving Thrift
dependency")
++ endif()
++
++ string(REPLACE "." ";" Thrift_VERSION_LIST "${Thrift_VERSION}")
+ list(GET Thrift_VERSION_LIST 0 Thrift_VERSION_MAJOR)
+ list(GET Thrift_VERSION_LIST 1 Thrift_VERSION_MINOR)
+ list(GET Thrift_VERSION_LIST 2 Thrift_VERSION_PATCH)
diff --git a/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
b/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
new file mode 100644
index 00000000000..31af1db7f0f
--- /dev/null
+++ b/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
@@ -0,0 +1,331 @@
+diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake
+index 74654a4..4065297 100644
+--- a/cmake_modules/BuildUtils.cmake
++++ b/cmake_modules/BuildUtils.cmake
+@@ -55,12 +55,18 @@ function(add_paimon_lib LIB_NAME)
+ # Necessary to make static linking into other shared libraries work
properly
+ set_property(TARGET ${LIB_NAME}_objlib PROPERTY POSITION_INDEPENDENT_CODE
1)
+ if(ARG_DEPENDENCIES)
+- # Avoid add_dependencies on non-existent targets (e.g. when building
static only).
++ # In static-only builds, some dependency names are still declared as
++ # *_shared. Map them to *_static when the shared target is
unavailable.
+ set(_paimon_objlib_deps)
+ foreach(_paimon_dep IN LISTS ARG_DEPENDENCIES)
+- if(TARGET ${_paimon_dep})
+- list(APPEND _paimon_objlib_deps ${_paimon_dep})
++ set(_paimon_mapped_dep "${_paimon_dep}")
++ if(NOT TARGET ${_paimon_mapped_dep} AND _paimon_dep MATCHES
"_shared$")
++ string(REGEX REPLACE "_shared$" "_static" _paimon_mapped_dep
"${_paimon_dep}")
+ endif()
++ if(TARGET ${_paimon_mapped_dep})
++ list(APPEND _paimon_objlib_deps ${_paimon_mapped_dep})
++ endif()
++ unset(_paimon_mapped_dep)
+ endforeach()
+ if(_paimon_objlib_deps)
+ add_dependencies(${LIB_NAME}_objlib ${_paimon_objlib_deps})
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -909,9 +909,6 @@ macro(build_orc)
+ "-DCMAKE_CXX_FLAGS=${ORC_CMAKE_CXX_FLAGS}"
+ "-DCMAKE_C_FLAGS=${ORC_CMAKE_C_FLAGS}"
+ "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${ORC_CMAKE_CXX_FLAGS}"
+- "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath=${ORC_RPATH}"
+- "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath=${ORC_RPATH}"
+- "-DCMAKE_MODULE_LINKER_FLAGS=-Wl,-rpath=${ORC_RPATH}"
+ "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}"
+ "-DLZ4_HOME=${ORC_LZ4_ROOT}"
+ "-DZSTD_HOME=${ORC_ZSTD_ROOT}"
+@@ -923,6 +920,13 @@ macro(build_orc)
+ -DBUILD_TOOLS=OFF
+ -DBUILD_CPP_ENABLE_METRICS=ON)
+
++ if(ORC_RPATH)
++ list(APPEND ORC_CMAKE_ARGS
++ "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath,${ORC_RPATH}"
++ "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath,${ORC_RPATH}"
++ "-DCMAKE_MODULE_LINKER_FLAGS=-Wl,-rpath,${ORC_RPATH}")
++ endif()
++
+ set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/orc.diff")
+ externalproject_add(orc_ep
+ URL ${ORC_SOURCE_URL}
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1024,6 +1024,14 @@ macro(build_arrow)
+ "-DCMAKE_C_FLAGS=${ARROW_CMAKE_C_FLAGS}"
+ "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${ARROW_CMAKE_CXX_FLAGS}"
+ -DARROW_DEPENDENCY_USE_SHARED=OFF
++ # Avoid forcing CONDA dependency mode when CONDA_PREFIX is present.
++ # AUTO keeps the normal "find system first, fallback to bundled"
++ # behavior and prevents accidental pickup of conda's thrift/zstd.
++ -DARROW_DEPENDENCY_SOURCE=AUTO
++ # Isolate from user/system CMake package registries to improve
++ # reproducibility in CI and local mixed environments.
++ -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF
++ -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF
+ -DARROW_BUILD_SHARED=OFF
+ -DARROW_BUILD_STATIC=ON
+ -DARROW_BUILD_TESTS=OFF
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -761,6 +761,9 @@ macro(build_protobuf)
+ get_target_property(THIRDPARTY_ZLIB_INCLUDE_DIR zlib
INTERFACE_INCLUDE_DIRECTORIES)
+ get_filename_component(THIRDPARTY_ZLIB_ROOT
"${THIRDPARTY_ZLIB_INCLUDE_DIR}"
+ DIRECTORY)
++ set(THIRDPARTY_ZLIB_STATIC_LIB
++
"${THIRDPARTY_ZLIB_ROOT}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}z${CMAKE_STATIC_LIBRARY_SUFFIX}"
++ )
+
+ # Strip lto flags (which may be added by dh_auto_configure)
+ # See https://github.com/protocolbuffers/protobuf/issues/7092
+@@ -778,6 +781,10 @@ macro(build_protobuf)
+ "-DCMAKE_CXX_FLAGS=${PROTOBUF_CXX_FLAGS}"
+ "-DCMAKE_C_FLAGS=${PROTOBUF_C_FLAGS}"
+ "-DZLIB_ROOT=${THIRDPARTY_ZLIB_ROOT}"
++ "-DZLIB_INCLUDE_DIR=${THIRDPARTY_ZLIB_INCLUDE_DIR}"
++ "-DZLIB_LIBRARY=${THIRDPARTY_ZLIB_STATIC_LIB}"
++ "-DZLIB_LIBRARY_RELEASE=${THIRDPARTY_ZLIB_STATIC_LIB}"
++ "-DZLIB_LIBRARY_DEBUG=${THIRDPARTY_ZLIB_STATIC_LIB}"
+ -Dprotobuf_BUILD_TESTS=OFF
+ -Dprotobuf_DEBUG_POSTFIX=)
+ set(PROTOBUF_CONFIGURE SOURCE_SUBDIR "cmake" CMAKE_ARGS
${PROTOBUF_CMAKE_ARGS})
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -34,6 +34,16 @@ set(EP_COMMON_TOOLCHAIN
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
+ "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
+
++option(PAIMON_USE_EXTERNAL_ARROW "Reuse external Arrow/Parquet instead of
building arrow_ep" OFF)
++set(PAIMON_EXTERNAL_ARROW_INCLUDE_DIR "" CACHE PATH
++ "Include directory for external Arrow/Parquet headers")
++set(PAIMON_EXTERNAL_ARROW_LIB "" CACHE FILEPATH "Path to external libarrow.a")
++set(PAIMON_EXTERNAL_ARROW_DATASET_LIB "" CACHE FILEPATH "Path to external
libarrow_dataset.a")
++set(PAIMON_EXTERNAL_ARROW_ACERO_LIB "" CACHE FILEPATH "Path to external
libarrow_acero.a")
++set(PAIMON_EXTERNAL_PARQUET_LIB "" CACHE FILEPATH "Path to external
libparquet.a")
++set(PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB "" CACHE FILEPATH
++ "Path to external libarrow_bundled_dependencies.a")
++
+ macro(set_urls URLS)
+ set(${URLS} ${ARGN})
+ endmacro()
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -961,5 +961,95 @@ macro(build_orc)
+ endmacro()
+
+ macro(build_arrow)
+- message(STATUS "Building Arrow from source")
++ if(PAIMON_USE_EXTERNAL_ARROW)
++    # Fail fast: validate the external include dir before touching the build tree, so a bad value never leaves a dangling symlink behind.
++    if(NOT PAIMON_EXTERNAL_ARROW_INCLUDE_DIR)
++      message(FATAL_ERROR
++                "PAIMON_EXTERNAL_ARROW_INCLUDE_DIR must be set when PAIMON_USE_EXTERNAL_ARROW=ON"
++      )
++    endif()
++    if(NOT EXISTS "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}")
++      message(FATAL_ERROR
++                "PAIMON_EXTERNAL_ARROW_INCLUDE_DIR not found: ${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}"
++      )
++    endif()
++    set(ARROW_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/doris_external_arrow_include")
++    file(MAKE_DIRECTORY "${ARROW_INCLUDE_DIR}")
++    if(NOT EXISTS "${ARROW_INCLUDE_DIR}/arrow")
++      execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink
++                              "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/arrow"
++                              "${ARROW_INCLUDE_DIR}/arrow")
++    endif()
++    if(EXISTS "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/parquet"
++       AND NOT EXISTS "${ARROW_INCLUDE_DIR}/parquet")
++      execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink
++                              "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/parquet"
++                              "${ARROW_INCLUDE_DIR}/parquet")
++    endif()
++
++ foreach(_paimon_external_lib
++ IN ITEMS PAIMON_EXTERNAL_ARROW_LIB
++ PAIMON_EXTERNAL_ARROW_DATASET_LIB
++ PAIMON_EXTERNAL_ARROW_ACERO_LIB
++ PAIMON_EXTERNAL_PARQUET_LIB
++ PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB)
++ if(NOT ${_paimon_external_lib})
++ message(FATAL_ERROR
++ "${_paimon_external_lib} must be set when
PAIMON_USE_EXTERNAL_ARROW=ON")
++ endif()
++ if(NOT EXISTS "${${_paimon_external_lib}}")
++ message(FATAL_ERROR
++ "${_paimon_external_lib} not found:
${${_paimon_external_lib}}")
++ endif()
++ endforeach()
++
++ add_library(arrow STATIC IMPORTED)
++ set_target_properties(arrow
++ PROPERTIES IMPORTED_LOCATION
"${PAIMON_EXTERNAL_ARROW_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ add_library(arrow_dataset STATIC IMPORTED)
++ set_target_properties(arrow_dataset
++ PROPERTIES IMPORTED_LOCATION
++
"${PAIMON_EXTERNAL_ARROW_DATASET_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ add_library(arrow_acero STATIC IMPORTED)
++ set_target_properties(arrow_acero
++ PROPERTIES IMPORTED_LOCATION
++ "${PAIMON_EXTERNAL_ARROW_ACERO_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ add_library(parquet STATIC IMPORTED)
++ set_target_properties(parquet
++ PROPERTIES IMPORTED_LOCATION
"${PAIMON_EXTERNAL_PARQUET_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ add_library(arrow_bundled_dependencies STATIC IMPORTED)
++ set_target_properties(arrow_bundled_dependencies
++ PROPERTIES IMPORTED_LOCATION
++
"${PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB}"
++ INTERFACE_INCLUDE_DIRECTORIES
++ "${ARROW_INCLUDE_DIR}")
++
++ target_link_libraries(arrow_acero INTERFACE arrow)
++
++ target_link_libraries(arrow_dataset INTERFACE arrow_acero)
++
++ target_link_libraries(arrow
++ INTERFACE zstd
++ snappy
++ lz4
++ zlib
++ arrow_bundled_dependencies)
++
++ target_link_libraries(parquet
++ INTERFACE zstd snappy lz4 zlib
arrow_bundled_dependencies
++ arrow_dataset)
++ else()
++ message(STATUS "Building Arrow from source")
+
+ get_target_property(ARROW_SNAPPY_INCLUDE_DIR snappy
INTERFACE_INCLUDE_DIRECTORIES)
+ get_filename_component(ARROW_SNAPPY_ROOT "${ARROW_SNAPPY_INCLUDE_DIR}"
DIRECTORY)
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1121,6 +1121,7 @@ macro(build_arrow)
+ zlib
+ arrow_bundled_dependencies
+ arrow_dataset)
++ endif()
+
+ endmacro(build_arrow)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -326,10 +326,10 @@ if(PAIMON_ENABLE_LUMINA)
+ include_directories("${CMAKE_SOURCE_DIR}/third_party/lumina/include")
+ endif()
+
++include_directories(SYSTEM ${GLOG_INCLUDE_DIR})
+ include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
+ include_directories(SYSTEM ${TBB_INCLUDE_DIR})
+
+-include_directories(SYSTEM ${GLOG_INCLUDE_DIR})
+ add_compile_definitions("GLOG_USE_GLOG_EXPORT")
+
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+
+diff --git a/src/paimon/common/logging/logging.cpp
b/src/paimon/common/logging/logging.cpp
+--- a/src/paimon/common/logging/logging.cpp
++++ b/src/paimon/common/logging/logging.cpp
+@@ -83,7 +83,7 @@ std::unique_ptr<Logger> Logger::GetLogger(const std::string&
path) {
+ }
+ std::unique_lock<std::shared_mutex> ulock(getRegistryLock());
+ if (!google::IsGoogleLoggingInitialized()) {
+- google::InitGoogleLogging(program_invocation_name);
++ google::InitGoogleLogging("paimon-cpp");
+ }
+ return std::make_unique<GlogAdaptor>();
+ }
+
+diff --git a/src/paimon/common/memory/memory_pool.cpp
b/src/paimon/common/memory/memory_pool.cpp
+--- a/src/paimon/common/memory/memory_pool.cpp
++++ b/src/paimon/common/memory/memory_pool.cpp
+@@ -55,7 +55,7 @@ void* MemoryPoolImpl::Malloc(uint64_t size, uint64_t
alignment) {
+ return memptr;
+ }
+
+-void* MemoryPoolImpl::Realloc(void* p, size_t old_size, size_t new_size,
size_t alignment) {
++void* MemoryPoolImpl::Realloc(void* p, size_t old_size, size_t new_size,
uint64_t alignment) {
+ if (alignment == 0) {
+ void* memptr = ::realloc(p, new_size);
+ total_allocated_size.fetch_add(new_size - old_size);
+
+diff --git a/src/paimon/format/blob/blob_format_writer.cpp
b/src/paimon/format/blob/blob_format_writer.cpp
+--- a/src/paimon/format/blob/blob_format_writer.cpp
++++ b/src/paimon/format/blob/blob_format_writer.cpp
+@@ -138,7 +138,8 @@ Status BlobFormatWriter::WriteBlob(std::string_view
blob_data) {
+ }
+ PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, in->Length());
+ uint64_t total_read_length = 0;
+- uint32_t read_len = std::min(file_length, tmp_buffer_->size());
++ uint32_t read_len =
++ static_cast<uint32_t>(std::min<uint64_t>(file_length,
tmp_buffer_->size()));
+ while (read_len > 0) {
+ PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_len,
in->Read(tmp_buffer_->data(), read_len));
+ if (static_cast<uint32_t>(actual_read_len) != read_len) {
+@@ -149,7 +150,8 @@ Status BlobFormatWriter::WriteBlob(std::string_view
blob_data) {
+ }
+ PAIMON_RETURN_NOT_OK(WriteWithCrc32(tmp_buffer_->data(),
actual_read_len));
+ total_read_length += actual_read_len;
+- read_len = std::min(file_length - total_read_length,
tmp_buffer_->size());
++ read_len = static_cast<uint32_t>(
++ std::min<uint64_t>(file_length - total_read_length,
tmp_buffer_->size()));
+ }
+
+ // write bin length
+
+--- a/cmake_modules/arrow.diff
++++ b/cmake_modules/arrow.diff
+@@ -196,3 +196,29 @@
+ int64_t pagesize_;
+ ParquetDataPageVersion parquet_data_page_version_;
+ ParquetVersion::type parquet_version_;
++diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
++index 9df922afa2..5c8b3d4d07 100644
++--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
++@@ -1789,7 +1789,20 @@ if(ARROW_WITH_THRIFT)
++ REQUIRED_VERSION
++ 0.11.0)
++
++- string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
+++ if(NOT Thrift_VERSION)
+++ if(DEFINED thrift_PC_VERSION AND thrift_PC_VERSION)
+++ set(Thrift_VERSION "${thrift_PC_VERSION}")
+++ elseif(DEFINED ThriftAlt_VERSION AND ThriftAlt_VERSION)
+++ set(Thrift_VERSION "${ThriftAlt_VERSION}")
+++ elseif(DEFINED THRIFT_VERSION AND THRIFT_VERSION)
+++ set(Thrift_VERSION "${THRIFT_VERSION}")
+++ endif()
+++ endif()
+++ if(NOT Thrift_VERSION)
+++ message(FATAL_ERROR "Thrift_VERSION is empty after resolving Thrift
dependency")
+++ endif()
+++
+++ string(REPLACE "." ";" Thrift_VERSION_LIST "${Thrift_VERSION}")
++ list(GET Thrift_VERSION_LIST 0 Thrift_VERSION_MAJOR)
++ list(GET Thrift_VERSION_LIST 1 Thrift_VERSION_MINOR)
++ list(GET Thrift_VERSION_LIST 2 Thrift_VERSION_PATCH)
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index 23eea97a349..b61978478c5 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -140,10 +140,11 @@ LZO2_SOURCE=lzo-2.10
LZO2_MD5SUM="39d3f3f9c55c87b1e5d6888e1420f4b5"
# rapidjson
-RAPIDJSON_DOWNLOAD="https://github.com/Tencent/rapidjson/archive/1a803826f1197b5e30703afe4b9c0e7dd48074f5.zip"
-RAPIDJSON_NAME=rapidjson-1a803826f1197b5e30703afe4b9c0e7dd48074f5.zip
-RAPIDJSON_SOURCE=rapidjson-1a803826f1197b5e30703afe4b9c0e7dd48074f5
-RAPIDJSON_MD5SUM="f2212a77e055a15501477f1e390007ea"
+# Updated to match paimon-cpp version (commit 232389d4f1012dddec4ef84861face2d2ba85709)
+RAPIDJSON_DOWNLOAD="https://github.com/miloyip/rapidjson/archive/232389d4f1012dddec4ef84861face2d2ba85709.tar.gz"
+RAPIDJSON_NAME=rapidjson-232389d4f1012dddec4ef84861face2d2ba85709.tar.gz
+RAPIDJSON_SOURCE=rapidjson-232389d4f1012dddec4ef84861face2d2ba85709
+RAPIDJSON_MD5SUM="577d3495a07b66fcd4a2866c93831bc4"
# curl
CURL_DOWNLOAD="https://curl.se/download/curl-8.2.1.tar.gz"
@@ -552,6 +553,14 @@ PUGIXML_NAME=pugixml-1.15.tar.gz
PUGIXML_SOURCE=pugixml-1.15
PUGIXML_MD5SUM="3b894c29455eb33a40b165c6e2de5895"
+# paimon-cpp
+# Fetched with git clone because paimon-cpp has no official release yet;
+# PAIMON_CPP_GIT_TAG pins an exact commit for reproducibility.
+PAIMON_CPP_GIT_URL="https://github.com/alibaba/paimon-cpp.git"
+PAIMON_CPP_GIT_TAG="0a4f4e2e7967fdb0be180711bbe581a18eeeb2dd"
+PAIMON_CPP_NAME=paimon-cpp
+PAIMON_CPP_SOURCE=paimon-cpp
+
# all thirdparties which need to be downloaded is set in array TP_ARCHIVES
export TP_ARCHIVES=(
'LIBEVENT'
@@ -634,6 +643,7 @@ export TP_ARCHIVES=(
'ICU'
'JINDOFS'
'PUGIXML'
+ 'PAIMON_CPP'
)
if [[ "$(uname -s)" == 'Darwin' ]]; then
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]