This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 5993dbd9148 branch-4.1: [chore](thirdparty) Integrate paimon-cpp into 
thirdparty build system #60296 (#61369)
5993dbd9148 is described below

commit 5993dbd91487837ac89ac2b14c99901ac7057c37
Author: Chenjunwei <[email protected]>
AuthorDate: Tue Mar 17 09:18:42 2026 +0800

    branch-4.1: [chore](thirdparty) Integrate paimon-cpp into thirdparty build 
system #60296 (#61369)
    
    Cherry-pick #60296 to branch-4.1
    
    ### What problem does this PR solve?
    
    - **Related PR**: #60296
    
    Integrate paimon-cpp into thirdparty build system.
    
    ### Cherry-pick commit
    
    - `2658f896441` - [chore](thirdparty) Integrate paimon-cpp into
    thirdparty build system (#60296)
    
    ---------
    
    Co-authored-by: Socrates <[email protected]>
---
 .github/workflows/build-thirdparty.yml             |  45 ++-
 thirdparty/build-thirdparty.sh                     | 215 ++++++++++++-
 thirdparty/download-thirdparty.sh                  |  98 ++++++
 thirdparty/paimon-cpp-cache.cmake                  | 136 +++++++++
 .../patches/apache-arrow-17.0.0-paimon.patch       | 224 ++++++++++++++
 .../paimon-cpp-buildutils-static-deps.patch        | 331 +++++++++++++++++++++
 thirdparty/vars.sh                                 |  18 +-
 7 files changed, 1049 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/build-thirdparty.yml 
b/.github/workflows/build-thirdparty.yml
index 6e7a888f416..cc707cf516b 100644
--- a/.github/workflows/build-thirdparty.yml
+++ b/.github/workflows/build-thirdparty.yml
@@ -65,16 +65,17 @@ jobs:
         run: |
           git clone -b v7 https://github.com/easimon/maximize-build-space
 
-      - name: Maximize build space
-        uses: ./maximize-build-space
-        with:
-            root-reserve-mb: 4096
-            swap-size-mb: 8192
-            remove-dotnet: 'true'
-            remove-android: 'true'
-            remove-haskell: 'true'
-            remove-codeql: 'true'
-            remove-docker-images: 'true'
+      #- name: Maximize build space
+      #  uses: ./maximize-build-space
+      #  with:
+      #      root-reserve-mb: 4096
+      #      temp-reserve-mb: 4096
+      #      swap-size-mb: 8192
+      #      remove-dotnet: 'true'
+      #      remove-android: 'true'
+      #      remove-haskell: 'true'
+      #      remove-codeql: 'true'
+      #      remove-docker-images: 'true'
 
       - name: Checkout ${{ github.ref }}
         uses: actions/checkout@v4
@@ -135,8 +136,18 @@ jobs:
           export CMAKE_POLICY_VERSION_MINIMUM=3.10
           export CUSTOM_CMAKE="/usr/local/bin/cmake"
 
+          #export TMPDIR=/home/runner/work/doris/doris/.tmp
+          #export TMP=$TMPDIR
+          #export TEMP=$TMPDIR
+          #mkdir -p $TMPDIR
+
+          df -h
+          echo $TMPDIR
+          echo $RUNNER_TEMP
+
           cd thirdparty
-          ./build-thirdparty.sh -j "$(nproc)"
+          #./build-thirdparty.sh -j "$(nproc)"
+          ./build-thirdparty.sh -j 2
 
   build_macos:
     name: Build Third Party Libraries (macOS)
@@ -195,7 +206,12 @@ jobs:
           export CMAKE_POLICY_VERSION_MINIMUM=3.10
           export CUSTOM_CMAKE="/usr/local/bin/cmake"
 
+          df -h
+          echo $TMPDIR
+          echo $RUNNER_TEMP
+
           cd thirdparty
+          #./build-thirdparty.sh -j "$(nproc)"
           ./build-thirdparty.sh -j "$(nproc)"
 
   build_macos_arm64:
@@ -255,6 +271,11 @@ jobs:
           export CMAKE_POLICY_VERSION_MINIMUM=3.10
           export CUSTOM_CMAKE="/usr/local/bin/cmake"
 
+          df -h
+          echo $TMPDIR
+          echo $RUNNER_TEMP
+
           cd thirdparty
-          ./build-thirdparty.sh -j "$(nproc)"
+          #./build-thirdparty.sh -j "$(nproc)"
+          ./build-thirdparty.sh -j 2
 
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index 7e440719bd7..18a6f5ac4de 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -638,7 +638,7 @@ build_lz4() {
 build_crc32c() {
     check_if_source_exist "${CRC32C_SOURCE}"
     cd "${TP_SOURCE_DIR}/${CRC32C_SOURCE}"
-    
+
     mkdir -p "${BUILD_DIR}"
     cd "${BUILD_DIR}"
 
@@ -1090,6 +1090,10 @@ build_arrow() {
         -DARROW_BUILD_STATIC=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_LZ4=ON 
-DARROW_USE_GLOG=ON \
         -DARROW_WITH_SNAPPY=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON 
-DARROW_JSON=ON \
         -DARROW_WITH_UTF8PROC=OFF -DARROW_WITH_RE2=ON -DARROW_ORC=ON \
+        -DARROW_COMPUTE=ON \
+        -DARROW_FILESYSTEM=ON \
+        -DARROW_DATASET=ON \
+        -DARROW_ACERO=ON \
         -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
         -DCMAKE_INSTALL_LIBDIR=lib64 \
         -DARROW_BOOST_USE_SHARED=OFF \
@@ -1137,6 +1141,8 @@ build_arrow() {
     cp -rf ./brotli_ep/src/brotli_ep-install/lib/libbrotlicommon-static.a 
"${TP_INSTALL_DIR}/lib64/libbrotlicommon.a"
     strip_lib libarrow.a
     strip_lib libparquet.a
+    strip_lib libarrow_dataset.a
+    strip_lib libarrow_acero.a
 }
 
 # abseil
@@ -1804,7 +1810,7 @@ build_libdeflate() {
     cd "${BUILD_DIR}"
 
     "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
-    -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DCMAKE_BUILD_TYPE=Release ..
+        -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DCMAKE_BUILD_TYPE=Release 
..
     "${BUILD_SYSTEM}" -j "${PARALLEL}"
     "${BUILD_SYSTEM}" install
 }
@@ -1979,6 +1985,89 @@ build_pugixml() {
     cp "${TP_SOURCE_DIR}/${PUGIXML_SOURCE}/src/pugiconfig.hpp" 
"${TP_INSTALL_DIR}/include/"
 }
 
+# paimon-cpp
+build_paimon_cpp() {
+    check_if_source_exist "${PAIMON_CPP_SOURCE}"
+    cd "${TP_SOURCE_DIR}/${PAIMON_CPP_SOURCE}"
+
+    rm -rf "${BUILD_DIR}"
+    mkdir -p "${BUILD_DIR}"
+    cd "${BUILD_DIR}"
+
+    # Darwin doesn't build GNU libunwind in this script, so don't force 
-lunwind there.
+    local paimon_linker_flags="-L${TP_LIB_DIR} -lbrotlienc -lbrotlidec 
-lbrotlicommon -llzma"
+    if [[ "${KERNEL}" != 'Darwin' ]]; then
+        paimon_linker_flags="${paimon_linker_flags} -lunwind"
+    fi
+
+    CXXFLAGS="-Wno-nontrivial-memcall" \
+    "${CMAKE_CMD}" -C "${TP_DIR}/paimon-cpp-cache.cmake" \
+        -G "${GENERATOR}" \
+        -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
+        -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
+        -DPAIMON_BUILD_SHARED=OFF \
+        -DPAIMON_BUILD_STATIC=ON \
+        -DPAIMON_BUILD_TESTS=OFF \
+        -DPAIMON_ENABLE_ORC=ON \
+        -DPAIMON_ENABLE_AVRO=OFF \
+        -DPAIMON_ENABLE_LANCE=OFF \
+        -DPAIMON_ENABLE_JINDO=OFF \
+        -DPAIMON_ENABLE_LUMINA=OFF \
+        -DPAIMON_ENABLE_LUCENE=OFF \
+        -DCMAKE_EXE_LINKER_FLAGS="${paimon_linker_flags}" \
+        -DCMAKE_SHARED_LINKER_FLAGS="${paimon_linker_flags}" \
+        ..
+    "${BUILD_SYSTEM}" -j "${PARALLEL}"
+    "${BUILD_SYSTEM}" install
+
+    # Install paimon-cpp internal dependencies with renamed versions
+    # These libraries are built but not installed by default
+    echo "Installing paimon-cpp internal dependencies..."
+
+    # Arrow deps: When PAIMON_USE_EXTERNAL_ARROW=ON (Plan B), paimon-cpp
+    # reuses Doris's Arrow and does NOT build arrow_ep, so the paimon_deps
+    # directory is not needed.  When building its own Arrow (legacy), copy
+    # arrow artefacts into an isolated directory to avoid clashing with Doris.
+    local paimon_deps_dir="${TP_INSTALL_DIR}/paimon-cpp/lib64/paimon_deps"
+    if [ -d "arrow_ep-install/lib" ]; then
+        mkdir -p "${paimon_deps_dir}"
+        for paimon_arrow_dep in \
+            libarrow.a \
+            libarrow_filesystem.a \
+            libarrow_dataset.a \
+            libarrow_acero.a \
+            libparquet.a; do
+            if [ -f "arrow_ep-install/lib/${paimon_arrow_dep}" ]; then
+                cp -v "arrow_ep-install/lib/${paimon_arrow_dep}" 
"${paimon_deps_dir}/${paimon_arrow_dep}"
+            fi
+        done
+    else
+        echo "  arrow_ep-install not found (PAIMON_USE_EXTERNAL_ARROW=ON?) – 
skipping paimon_deps Arrow copy"
+    fi
+
+    # Install roaring_bitmap, renamed to avoid conflict with Doris's 
croaringbitmap
+    if [ -f "release/libroaring_bitmap.a" ]; then
+        cp -v "release/libroaring_bitmap.a" 
"${TP_INSTALL_DIR}/lib64/libroaring_bitmap_paimon.a"
+    fi
+
+    # Install xxhash, renamed to avoid conflict with Doris's xxhash
+    if [ -f "release/libxxhash.a" ]; then
+        cp -v "release/libxxhash.a" 
"${TP_INSTALL_DIR}/lib64/libxxhash_paimon.a"
+    fi
+
+    # Install fmt v11 (from fmt_ep-install directory, renamed to avoid 
conflict with Doris's fmt v7)
+    if [ -f "fmt_ep-install/lib/libfmt.a" ]; then
+        cp -v "fmt_ep-install/lib/libfmt.a" 
"${TP_INSTALL_DIR}/lib64/libfmt_paimon.a"
+    fi
+
+    # Install tbb (from tbb_ep-install directory, renamed to avoid conflict 
with Doris's tbb)
+    if [ -f "tbb_ep-install/lib/libtbb.a" ]; then
+        cp -v "tbb_ep-install/lib/libtbb.a" 
"${TP_INSTALL_DIR}/lib64/libtbb_paimon.a"
+    fi
+
+    echo "Paimon-cpp internal dependencies installed successfully"
+}
+
 if [[ "${#packages[@]}" -eq 0 ]]; then
     packages=(
         jindofs
@@ -2051,6 +2140,7 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
         brotli
         icu
         pugixml
+        paimon_cpp
     )
     if [[ "$(uname -s)" == 'Darwin' ]]; then
         read -r -a packages <<<"binutils gettext ${packages[*]}"
@@ -2060,6 +2150,122 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
     fi
 fi
 
+# Map a package name to its source directory variable(s) and remove them to 
free disk space.
+# This is called after each package is built and installed successfully.
+cleanup_package_source() {
+    local pkg="$1"
+    local src_var
+    local src_dir
+
+    # Map package name to the uppercase *_SOURCE variable name
+    case "${pkg}" in
+        libevent)        src_var="LIBEVENT_SOURCE" ;;
+        openssl)         src_var="OPENSSL_SOURCE" ;;
+        thrift)          src_var="THRIFT_SOURCE" ;;
+        protobuf)        src_var="PROTOBUF_SOURCE" ;;
+        gflags)          src_var="GFLAGS_SOURCE" ;;
+        glog)            src_var="GLOG_SOURCE" ;;
+        gtest)           src_var="GTEST_SOURCE" ;;
+        rapidjson)       src_var="RAPIDJSON_SOURCE" ;;
+        snappy)          src_var="SNAPPY_SOURCE" ;;
+        gperftools)      src_var="GPERFTOOLS_SOURCE" ;;
+        zlib)            src_var="ZLIB_SOURCE" ;;
+        crc32c)          src_var="CRC32C_SOURCE" ;;
+        lz4)             src_var="LZ4_SOURCE" ;;
+        bzip)            src_var="BZIP_SOURCE" ;;
+        lzo2)            src_var="LZO2_SOURCE" ;;
+        zstd)            src_var="ZSTD_SOURCE" ;;
+        #boost)           src_var="BOOST_SOURCE" ;; // boost is used for mysql 
later
+        abseil)          src_var="ABSEIL_SOURCE" ;;
+        curl)            src_var="CURL_SOURCE" ;;
+        re2)             src_var="RE2_SOURCE" ;;
+        hyperscan)
+            # hyperscan also builds ragel, clean both
+            if [[ -n "${RAGEL_SOURCE}" && -d 
"${TP_SOURCE_DIR}/${RAGEL_SOURCE}" ]]; then
+                echo "Cleaning up source: ${RAGEL_SOURCE}"
+                rm -rf "${TP_SOURCE_DIR}/${RAGEL_SOURCE}"
+            fi
+            src_var="HYPERSCAN_SOURCE"
+            ;;
+        mysql)           src_var="MYSQL_SOURCE" ;;
+        odbc)            src_var="ODBC_SOURCE" ;;
+        leveldb)         src_var="LEVELDB_SOURCE" ;;
+        brpc)            src_var="BRPC_SOURCE" ;;
+        rocksdb)         src_var="ROCKSDB_SOURCE" ;;
+        cyrus_sasl)      src_var="CYRUS_SASL_SOURCE" ;;
+        librdkafka)      src_var="LIBRDKAFKA_SOURCE" ;;
+        flatbuffers)     src_var="FLATBUFFERS_SOURCE" ;;
+        arrow)           src_var="ARROW_SOURCE" ;;
+        brotli)          src_var="BROTLI_SOURCE" ;;
+        cares)           src_var="CARES_SOURCE" ;;
+        grpc)            src_var="GRPC_SOURCE" ;;
+        s2)              src_var="S2_SOURCE" ;;
+        bitshuffle)      src_var="BITSHUFFLE_SOURCE" ;;
+        croaringbitmap)  src_var="CROARINGBITMAP_SOURCE" ;;
+        fmt)             src_var="FMT_SOURCE" ;;
+        parallel_hashmap) src_var="PARALLEL_HASHMAP_SOURCE" ;;
+        orc)             src_var="ORC_SOURCE" ;;
+        cctz)            src_var="CCTZ_SOURCE" ;;
+        jemalloc_doris)  src_var="JEMALLOC_DORIS_SOURCE" ;;
+        libunwind)       src_var="LIBUNWIND_SOURCE" ;;
+        benchmark)       src_var="BENCHMARK_SOURCE" ;;
+        simdjson)        src_var="SIMDJSON_SOURCE" ;;
+        nlohmann_json)   src_var="NLOHMANN_JSON_SOURCE" ;;
+        libbacktrace)    src_var="LIBBACKTRACE_SOURCE" ;;
+        sse2neon)        src_var="SSE2NEON_SOURCE" ;;
+        xxhash)          src_var="XXHASH_SOURCE" ;;
+        concurrentqueue) src_var="CONCURRENTQUEUE_SOURCE" ;;
+        fast_float)      src_var="FAST_FLOAT_SOURCE" ;;
+        hadoop_libs)     src_var="HADOOP_LIBS_SOURCE" ;;
+        hadoop_libs_3_4) src_var="HADOOP_LIBS_3_4_SOURCE" ;;
+        avx2neon)        src_var="AVX2NEON_SOURCE" ;;
+        libdeflate)      src_var="LIBDEFLATE_SOURCE" ;;
+        streamvbyte)     src_var="STREAMVBYTE_SOURCE" ;;
+        ali_sdk)
+            # ali_sdk internally builds jsoncpp and libuuid, clean all three
+            for dep_var in JSONCPP_SOURCE LIBUUID_SOURCE ALI_SDK_SOURCE; do
+                dep_dir="${!dep_var}"
+                if [[ -n "${dep_dir}" && -d "${TP_SOURCE_DIR}/${dep_dir}" ]]; 
then
+                    echo "Cleaning up source: ${dep_dir}"
+                    rm -rf "${TP_SOURCE_DIR}/${dep_dir}"
+                fi
+            done
+            return
+            ;;
+        base64)          src_var="BASE64_SOURCE" ;;
+        azure)           src_var="AZURE_SOURCE" ;;
+        dragonbox)       src_var="DRAGONBOX_SOURCE" ;;
+        icu)             src_var="ICU_SOURCE" ;;
+        jindofs)         src_var="JINDOFS_SOURCE" ;;
+        pugixml)         src_var="PUGIXML_SOURCE" ;;
+        paimon_cpp)      src_var="PAIMON_CPP_SOURCE" ;;
+        aws_sdk)         src_var="AWS_SDK_SOURCE" ;;
+        lzma)            src_var="LZMA_SOURCE" ;;
+        xml2)            src_var="XML2_SOURCE" ;;
+        idn)             src_var="IDN_SOURCE" ;;
+        gsasl)           src_var="GSASL_SOURCE" ;;
+        krb5)            src_var="KRB5_SOURCE" ;;
+        hdfs3)           src_var="HDFS3_SOURCE" ;;
+        libdivide)       src_var="LIBDIVIDE_SOURCE" ;;
+        binutils)        src_var="BINUTILS_SOURCE" ;;
+        gettext)         src_var="GETTEXT_SOURCE" ;;
+        # Header-only files, skip cleanup
+        pdqsort|timsort|tsan_header|js_and_css)
+            return
+            ;;
+        *)
+            echo "Warning: no source mapping for package '${pkg}', skipping 
cleanup"
+            return
+            ;;
+    esac
+
+    src_dir="${!src_var}"
+    if [[ -n "${src_dir}" && -d "${TP_SOURCE_DIR}/${src_dir}" ]]; then
+        echo "Cleaning up source: ${src_dir}"
+        rm -rf "${TP_SOURCE_DIR}/${src_dir}"
+    fi
+}
+
 for package in "${packages[@]}"; do
     if [[ "${package}" == "${start_package}" ]]; then
         PACKAGE_FOUND=1
@@ -2067,6 +2273,11 @@ for package in "${packages[@]}"; do
     if [[ "${CONTINUE}" -eq 0 ]] || [[ "${PACKAGE_FOUND}" -eq 1 ]]; then
         command="build_${package}"
         ${command}
+        cd "${TP_DIR}"
+        cleanup_package_source "${package}"
+        echo "debug after clean: ${package}"
+        df -h
+        du -sh "${TP_DIR}"
     fi
 done
 
diff --git a/thirdparty/download-thirdparty.sh 
b/thirdparty/download-thirdparty.sh
index f3b999115b7..d85b86704d3 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -103,6 +103,18 @@ md5sum_func() {
     return 0
 }
 
+is_git_package() {
+    local TP_ARCH="$1"
+    local GIT_URL_VAR="${TP_ARCH}_GIT_URL"
+    [[ -n "${!GIT_URL_VAR}" ]]
+}
+
+git_url_for() {
+    local TP_ARCH="$1"
+    local GIT_URL_VAR="${TP_ARCH}_GIT_URL"
+    echo "${!GIT_URL_VAR}"
+}
+
 # return 0 if download succeed.
 # return 1 if not.
 download_func() {
@@ -159,6 +171,10 @@ download_func() {
 # download thirdparty archives
 echo "===== Downloading thirdparty archives..."
 for TP_ARCH in "${TP_ARCHIVES[@]}"; do
+    if is_git_package "${TP_ARCH}"; then
+        echo "Skip downloading ${TP_ARCH} (git repo: $(git_url_for 
"${TP_ARCH}"))"
+        continue
+    fi
     NAME="${TP_ARCH}_NAME"
     MD5SUM="${TP_ARCH}_MD5SUM"
     if [[ -z "${REPOSITORY_URL}" ]]; then
@@ -184,6 +200,9 @@ echo "===== Downloading thirdparty archives...done"
 # check if all tp archives exists
 echo "===== Checking all thirdpart archives..."
 for TP_ARCH in "${TP_ARCHIVES[@]}"; do
+    if is_git_package "${TP_ARCH}"; then
+        continue
+    fi
     NAME="${TP_ARCH}_NAME"
     if [[ ! -r "${TP_SOURCE_DIR}/${!NAME}" ]]; then
         echo "Failed to fetch ${!NAME}"
@@ -201,6 +220,9 @@ SUFFIX_XZ="\.tar\.xz$"
 SUFFIX_ZIP="\.zip$"
 SUFFIX_BZ2="\.tar\.bz2$"
 for TP_ARCH in "${TP_ARCHIVES[@]}"; do
+    if is_git_package "${TP_ARCH}"; then
+        continue
+    fi
     NAME="${TP_ARCH}_NAME"
     SOURCE="${TP_ARCH}_SOURCE"
 
@@ -240,6 +262,57 @@ for TP_ARCH in "${TP_ARCHIVES[@]}"; do
 done
 echo "===== Unpacking all thirdparty archives...done"
 
+# Clone and checkout git repositories
+echo "===== Cloning git repositories..."
+for TP_ARCH in "${TP_ARCHIVES[@]}"; do
+    if ! is_git_package "${TP_ARCH}"; then
+        continue
+    fi
+
+    GIT_URL_VAR="${TP_ARCH}_GIT_URL"
+    GIT_TAG_VAR="${TP_ARCH}_GIT_TAG"
+    SOURCE_VAR="${TP_ARCH}_SOURCE"
+    
+    GIT_URL="${!GIT_URL_VAR}"
+    GIT_TAG="${!GIT_TAG_VAR}"
+    SOURCE_DIR="${TP_SOURCE_DIR}/${!SOURCE_VAR}"
+
+    if [[ -z "${GIT_URL}" ]] || [[ -z "${GIT_TAG}" ]] || [[ -z 
"${!SOURCE_VAR}" ]]; then
+        echo "Warning: ${TP_ARCH} git configuration incomplete, skipping"
+        continue
+    fi
+
+    if [[ ! -d "${SOURCE_DIR}" ]]; then
+        echo "Cloning ${TP_ARCH} from ${GIT_URL}..."
+        cd "${TP_SOURCE_DIR}"
+        if ! git clone "${GIT_URL}" "${!SOURCE_VAR}"; then
+            echo "Failed to clone ${TP_ARCH}"
+            exit 1
+        fi
+    else
+        echo "${TP_ARCH} repository already exists, updating..."
+        cd "${SOURCE_DIR}"
+        git fetch origin || true
+    fi
+
+    cd "${SOURCE_DIR}"
+    if ! git checkout "${GIT_TAG}" 2>/dev/null; then
+        echo "Tag ${GIT_TAG} not found, trying to fetch..."
+        is_shallow="$(git rev-parse --is-shallow-repository 2>/dev/null || 
echo false)"
+        if [[ "${is_shallow}" == "true" ]]; then
+            git fetch --unshallow origin || git fetch --depth=2147483647 origin
+        else
+            git fetch origin
+        fi
+        if ! git checkout "${GIT_TAG}"; then
+            echo "Failed to checkout ${GIT_TAG} for ${TP_ARCH}"
+            exit 1
+        fi
+    fi
+    echo "Successfully checked out ${GIT_TAG} for ${TP_ARCH}"
+done
+echo "===== Cloning git repositories...done"
+
 echo "===== Patching thirdparty archives..."
 
 
###################################################################################
@@ -352,6 +425,16 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
         fi
         cd -
     fi
+    if [[ "${ARROW_SOURCE}" == "arrow-apache-arrow-17.0.0" ]]; then
+        cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}"
+        if [[ ! -f "${PATCHED_MARK}" ]]; then
+            # Paimon-cpp parquet patches: row-group-aware batch reader, 
max_row_group_size,
+            # GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty 
fix.
+            patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch"
+            touch "${PATCHED_MARK}"
+        fi
+        cd -
+    fi
     echo "Finished patching ${ARROW_SOURCE}"
 fi
 
@@ -612,6 +695,21 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " AZURE " ]]; then
     echo "Finished patching ${AZURE_SOURCE}"
 fi
 
+# patch paimon-cpp
+if [[ " ${TP_ARCHIVES[*]} " =~ " PAIMON_CPP " ]]; then
+    cd "${TP_SOURCE_DIR}/${PAIMON_CPP_SOURCE}"
+    if [[ ! -f "${PATCHED_MARK}" ]]; then
+        if patch -p1 -N --batch --dry-run 
<"${TP_PATCH_DIR}/paimon-cpp-buildutils-static-deps.patch" >/dev/null 2>&1; then
+            patch -p1 -N --batch 
<"${TP_PATCH_DIR}/paimon-cpp-buildutils-static-deps.patch"
+        else
+            echo "Skip paimon-cpp patch: already applied or not applicable for 
current source"
+        fi
+        touch "${PATCHED_MARK}"
+    fi
+    cd -
+    echo "Finished patching ${PAIMON_CPP_SOURCE}"
+fi
+
 if [[ " ${TP_ARCHIVES[*]} " =~ " CCTZ " ]] ; then
     cd $TP_SOURCE_DIR/$CCTZ_SOURCE
     if [[ ! -f "$PATCHED_MARK" ]] ; then
diff --git a/thirdparty/paimon-cpp-cache.cmake 
b/thirdparty/paimon-cpp-cache.cmake
new file mode 100644
index 00000000000..dbebd94a0cc
--- /dev/null
+++ b/thirdparty/paimon-cpp-cache.cmake
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# CMake Initial Cache for paimon-cpp
+# Configures paimon-cpp to reuse selected Doris third-party libraries
+# Usage: cmake -C paimon-cpp-cache.cmake ...
+
+# Get the Doris thirdparty installation directory from environment
+set(DORIS_THIRDPARTY_DIR "$ENV{TP_INSTALL_DIR}" CACHE PATH "Doris thirdparty 
install directory")
+
+if(NOT DORIS_THIRDPARTY_DIR)
+    message(FATAL_ERROR "TP_INSTALL_DIR environment variable must be set")
+endif()
+
+message(STATUS "Using Doris thirdparty libraries from: 
${DORIS_THIRDPARTY_DIR}")
+
+# Set CMAKE_PREFIX_PATH to help find_package locate our libraries
+set(CMAKE_PREFIX_PATH "${DORIS_THIRDPARTY_DIR};${CMAKE_PREFIX_PATH}" CACHE 
STRING "Search path for find_package")
+
+# Library and include paths
+set(DORIS_LIB_DIR "${DORIS_THIRDPARTY_DIR}/lib" CACHE PATH "Doris library 
directory")
+set(DORIS_INCLUDE_DIR "${DORIS_THIRDPARTY_DIR}/include" CACHE PATH "Doris 
include directory")
+
+# ============================================================================
+# ZLIB - Reuse from Doris (version 1.3.1)
+# ============================================================================
+set(ZLIB_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "ZLIB root directory")
+set(ZLIB_LIBRARY "${DORIS_LIB_DIR}/libz.a" CACHE FILEPATH "ZLIB library")
+set(ZLIB_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "ZLIB include 
directory")
+
+# ============================================================================
+# ZSTD - Reuse from Doris (version 1.5.7)
+# ============================================================================
+set(ZSTD_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "ZSTD root directory")
+set(ZSTD_LIBRARY "${DORIS_LIB_DIR}/libzstd.a" CACHE FILEPATH "ZSTD library")
+set(ZSTD_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "ZSTD include 
directory")
+
+# ============================================================================
+# LZ4 - Reuse from Doris (version 1.9.4)
+# ============================================================================
+set(LZ4_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "LZ4 root directory")
+set(LZ4_LIBRARY "${DORIS_LIB_DIR}/liblz4.a" CACHE FILEPATH "LZ4 library")
+set(LZ4_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "LZ4 include directory")
+
+# ============================================================================
+# glog - NOT reused from Doris
+# paimon-cpp's build_glog() unconditionally calls externalproject_add() to
+# build glog 0.7.1.  Any GLOG_ROOT/GLOG_LIBRARY/GLOG_INCLUDE_DIR set here
+# would be overwritten by that macro, so we skip them entirely.
+# ============================================================================
+
+# ============================================================================
+# Arrow - Reuse from Doris (Doris Arrow now includes 
COMPUTE/DATASET/ACERO/FILESYSTEM)
+# Doris's Arrow 17.0.0 is built with the full module set that paimon-cpp
+# needs, so we skip paimon-cpp's internal externalproject_add(arrow_ep ...).
+# ============================================================================
+set(PAIMON_USE_EXTERNAL_ARROW ON CACHE BOOL "Use pre-built Arrow from Doris 
instead of building from source")
+
+set(DORIS_LIB64_DIR "${DORIS_THIRDPARTY_DIR}/lib64" CACHE PATH "Doris lib64 
directory")
+
+set(PAIMON_EXTERNAL_ARROW_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "Arrow 
include directory")
+set(PAIMON_EXTERNAL_ARROW_LIB "${DORIS_LIB64_DIR}/libarrow.a" CACHE FILEPATH 
"Arrow core library")
+set(PAIMON_EXTERNAL_ARROW_DATASET_LIB "${DORIS_LIB64_DIR}/libarrow_dataset.a" 
CACHE FILEPATH "Arrow Dataset library")
+set(PAIMON_EXTERNAL_ARROW_ACERO_LIB "${DORIS_LIB64_DIR}/libarrow_acero.a" 
CACHE FILEPATH "Arrow Acero library")
+set(PAIMON_EXTERNAL_PARQUET_LIB "${DORIS_LIB64_DIR}/libparquet.a" CACHE 
FILEPATH "Parquet library")
+set(PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB 
"${DORIS_LIB64_DIR}/libarrow_bundled_dependencies.a" CACHE FILEPATH "Arrow 
bundled dependencies library")
+
+# Protobuf, Thrift - still built separately by paimon-cpp
+
+# ============================================================================
+# Snappy - Reuse from Doris
+# ============================================================================
+set(Snappy_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "Snappy root directory")
+set(SNAPPY_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "Snappy root directory 
(legacy)")
+set(SNAPPY_LIBRARY "${DORIS_LIB_DIR}/libsnappy.a" CACHE FILEPATH "Snappy 
library")
+set(SNAPPY_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "Snappy include 
directory")
+
+# ============================================================================
+# Build configuration
+# ============================================================================
+set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "Build with -fPIC")
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type")
+
+# Symbol visibility control to prevent conflicts with Doris
+# paimon-cpp builds Arrow/ORC/etc with hidden symbols to avoid conflicts
+set(CMAKE_CXX_VISIBILITY_PRESET "hidden" CACHE STRING "Hide C++ symbols by 
default")
+set(CMAKE_C_VISIBILITY_PRESET "hidden" CACHE STRING "Hide C symbols by 
default")
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON CACHE BOOL "Hide inline function 
symbols")
+
+# Verify that required libraries exist
+if(NOT EXISTS "${ZLIB_LIBRARY}")
+    message(FATAL_ERROR "ZLIB library not found: ${ZLIB_LIBRARY}")
+endif()
+if(NOT EXISTS "${ZSTD_LIBRARY}")
+    message(FATAL_ERROR "ZSTD library not found: ${ZSTD_LIBRARY}")
+endif()
+if(NOT EXISTS "${LZ4_LIBRARY}")
+    message(FATAL_ERROR "LZ4 library not found: ${LZ4_LIBRARY}")
+endif()
+if(NOT EXISTS "${SNAPPY_LIBRARY}")
+    message(FATAL_ERROR "Snappy library not found: ${SNAPPY_LIBRARY}")
+endif()
+
+message(STATUS "========================================")
+message(STATUS "Paimon-cpp Library Reuse Configuration")
+message(STATUS "========================================")
+message(STATUS "Reusing from Doris:")
+message(STATUS "  ✓ ZLIB, ZSTD, LZ4, Snappy")
+if(PAIMON_USE_EXTERNAL_ARROW)
+    message(STATUS "  ✓ Arrow, Parquet, Arrow Dataset, Arrow Acero (Plan B)")
+else()
+    message(STATUS "  ✗ Arrow (building separately, symbol visibility=hidden)")
+endif()
+message(STATUS "")
+message(STATUS "Building separately:")
+if(NOT PAIMON_USE_EXTERNAL_ARROW)
+    message(STATUS "  - Arrow, Protobuf, Thrift, ORC")
+else()
+    message(STATUS "  - Protobuf, Thrift, ORC")
+endif()
+message(STATUS "  - glog, RapidJSON, TBB")
+message(STATUS "========================================")
diff --git a/thirdparty/patches/apache-arrow-17.0.0-paimon.patch 
b/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
new file mode 100644
index 00000000000..4e53117b79b
--- /dev/null
+++ b/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
@@ -0,0 +1,224 @@
+diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
+index ec3890a41f..943f69bb6c 100644
+--- a/cpp/src/parquet/arrow/schema.cc
++++ b/cpp/src/parquet/arrow/schema.cc
+@@ -178,7 +178,7 @@ static Status GetTimestampMetadata(const 
::arrow::TimestampType& type,
+
+   // The user is explicitly asking for Impala int96 encoding, there is no
+   // logical type.
+-  if (arrow_properties.support_deprecated_int96_timestamps()) {
++  if (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO) {
+     *physical_type = ParquetType::INT96;
+     return Status::OK();
+   }
+
+diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
+index 285e2a5973..aa6f92f077 100644
+--- a/cpp/src/parquet/arrow/reader.cc
++++ b/cpp/src/parquet/arrow/reader.cc
+@@ -1013,25 +1013,32 @@ Status FileReaderImpl::GetRecordBatchReader(const 
std::vector<int>& row_groups,
+     return Status::OK();
+   }
+
+-  int64_t num_rows = 0;
++  std::vector<int64_t> num_rows;
+   for (int row_group : row_groups) {
+-    num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
++    
num_rows.push_back(parquet_reader()->metadata()->RowGroup(row_group)->num_rows());
+   }
+
+   using ::arrow::RecordBatchIterator;
++  int row_group_idx = 0;
+
+   // NB: This lambda will be invoked outside the scope of this call to
+   // `GetRecordBatchReader()`, so it must capture `readers` and 
`batch_schema` by value.
+   // `this` is a non-owning pointer so we are relying on the parent 
FileReader outliving
+   // this RecordBatchReader.
+   ::arrow::Iterator<RecordBatchIterator> batches = 
::arrow::MakeFunctionIterator(
+-      [readers, batch_schema, num_rows,
++      [readers, batch_schema, num_rows, row_group_idx,
+        this]() mutable -> ::arrow::Result<RecordBatchIterator> {
+         ::arrow::ChunkedArrayVector columns(readers.size());
+
+-        // don't reserve more rows than necessary
+-        int64_t batch_size = std::min(properties().batch_size(), num_rows);
+-        num_rows -= batch_size;
++        int64_t batch_size = 0;
++        if (!num_rows.empty()) {
++          // don't reserve more rows than necessary
++          batch_size = std::min(properties().batch_size(), 
num_rows[row_group_idx]);
++          num_rows[row_group_idx] -= batch_size;
++          if (num_rows[row_group_idx] == 0 && (num_rows.size() - 1) != 
row_group_idx) {
++            row_group_idx++;
++          }
++        }
+
+         RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
+             reader_properties_.use_threads(), 
static_cast<int>(readers.size()),
+diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
+index 4fd7ef1b47..87326a54f1 100644
+--- a/cpp/src/parquet/arrow/writer.cc
++++ b/cpp/src/parquet/arrow/writer.cc
+@@ -314,6 +314,14 @@ class FileWriterImpl : public FileWriter {
+     return Status::OK();
+   }
+
++  int64_t GetBufferedSize() override {
++    if (row_group_writer_ == nullptr) {
++      return 0;
++    }
++    return row_group_writer_->total_compressed_bytes() +
++      row_group_writer_->total_compressed_bytes_written();
++  }
++
+   Status Close() override {
+     if (!closed_) {
+       // Make idempotent
+@@ -418,10 +426,13 @@ class FileWriterImpl : public FileWriter {
+
+     // Max number of rows allowed in a row group.
+     const int64_t max_row_group_length = 
this->properties().max_row_group_length();
++    const int64_t max_row_group_size = 
this->properties().max_row_group_size();
+
+     // Initialize a new buffered row group writer if necessary.
+     if (row_group_writer_ == nullptr || !row_group_writer_->buffered() ||
+-        row_group_writer_->num_rows() >= max_row_group_length) {
++        row_group_writer_->num_rows() >= max_row_group_length ||
++        (row_group_writer_->total_compressed_bytes_written() +
++         row_group_writer_->total_compressed_bytes() >= max_row_group_size)) {
+       RETURN_NOT_OK(NewBufferedRowGroup());
+     }
+
+diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h
+index 4a1a033a7b..0f13d05e44 100644
+--- a/cpp/src/parquet/arrow/writer.h
++++ b/cpp/src/parquet/arrow/writer.h
+@@ -138,6 +138,9 @@ class PARQUET_EXPORT FileWriter {
+   /// option in this case.
+   virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch) 
= 0;
+
++  /// \brief Return the buffered size in bytes.
++  virtual int64_t GetBufferedSize() = 0;
++
+   /// \brief Write the footer and close the file.
+   virtual ::arrow::Status Close() = 0;
+   virtual ~FileWriter();
+diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
+index 4d3acb491e..3906ff3c59 100644
+--- a/cpp/src/parquet/properties.h
++++ b/cpp/src/parquet/properties.h
+@@ -139,6 +139,7 @@ static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+ static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = 
kDefaultDataPageSize;
+ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
+ static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
++static constexpr int64_t DEFAULT_MAX_ROW_GROUP_SIZE = 128 * 1024 * 1024;
+ static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
+ static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
+ static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
+@@ -232,6 +233,7 @@ class PARQUET_EXPORT WriterProperties {
+           dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
+           write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
+           max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
++          max_row_group_size_(DEFAULT_MAX_ROW_GROUP_SIZE),
+           pagesize_(kDefaultDataPageSize),
+           version_(ParquetVersion::PARQUET_2_6),
+           data_page_version_(ParquetDataPageVersion::V1),
+@@ -244,6 +246,7 @@ class PARQUET_EXPORT WriterProperties {
+           dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
+           write_batch_size_(properties.write_batch_size()),
+           max_row_group_length_(properties.max_row_group_length()),
++          max_row_group_size_(properties.max_row_group_size()),
+           pagesize_(properties.data_pagesize()),
+           version_(properties.version()),
+           data_page_version_(properties.data_page_version()),
+@@ -321,6 +324,13 @@ class PARQUET_EXPORT WriterProperties {
+       return this;
+     }
+
++    /// Specify the max bytes size to put in a single row group.
++    /// Default 128 M.
++    Builder* max_row_group_size(int64_t max_row_group_size) {
++      max_row_group_size_ = max_row_group_size;
++      return this;
++    }
++
+     /// Specify the data page size.
+     /// Default 1MB.
+     Builder* data_pagesize(int64_t pg_size) {
+@@ -664,7 +674,7 @@ class PARQUET_EXPORT WriterProperties {
+
+       return std::shared_ptr<WriterProperties>(new WriterProperties(
+           pool_, dictionary_pagesize_limit_, write_batch_size_, 
max_row_group_length_,
+-          pagesize_, version_, created_by_, page_checksum_enabled_,
++          max_row_group_size_, pagesize_, version_, created_by_, 
page_checksum_enabled_,
+           std::move(file_encryption_properties_), default_column_properties_,
+           column_properties, data_page_version_, store_decimal_as_integer_,
+           std::move(sorting_columns_)));
+@@ -675,6 +685,7 @@ class PARQUET_EXPORT WriterProperties {
+     int64_t dictionary_pagesize_limit_;
+     int64_t write_batch_size_;
+     int64_t max_row_group_length_;
++    int64_t max_row_group_size_;
+     int64_t pagesize_;
+     ParquetVersion::type version_;
+     ParquetDataPageVersion data_page_version_;
+@@ -705,6 +716,8 @@ class PARQUET_EXPORT WriterProperties {
+
+   inline int64_t max_row_group_length() const { return max_row_group_length_; 
}
+
++  inline int64_t max_row_group_size() const { return max_row_group_size_; }
++
+   inline int64_t data_pagesize() const { return pagesize_; }
+
+   inline ParquetDataPageVersion data_page_version() const {
+@@ -810,7 +823,7 @@ class PARQUET_EXPORT WriterProperties {
+  private:
+   explicit WriterProperties(
+       MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t 
write_batch_size,
+-      int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type 
version,
++      int64_t max_row_group_length, int64_t max_row_group_size, int64_t 
pagesize, ParquetVersion::type version,
+       const std::string& created_by, bool page_write_checksum_enabled,
+       std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
+       const ColumnProperties& default_column_properties,
+@@ -821,6 +834,7 @@ class PARQUET_EXPORT WriterProperties {
+         dictionary_pagesize_limit_(dictionary_pagesize_limit),
+         write_batch_size_(write_batch_size),
+         max_row_group_length_(max_row_group_length),
++        max_row_group_size_(max_row_group_size),
+         pagesize_(pagesize),
+         parquet_data_page_version_(data_page_version),
+         parquet_version_(version),
+@@ -836,6 +850,7 @@ class PARQUET_EXPORT WriterProperties {
+   int64_t dictionary_pagesize_limit_;
+   int64_t write_batch_size_;
+   int64_t max_row_group_length_;
++  int64_t max_row_group_size_;
+   int64_t pagesize_;
+   ParquetDataPageVersion parquet_data_page_version_;
+   ParquetVersion::type parquet_version_;
+diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
+index 9df922afa2..5c8b3d4d07 100644
+--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1789,7 +1789,20 @@ if(ARROW_WITH_THRIFT)
+                     REQUIRED_VERSION
+                     0.11.0)
+ 
+-  string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
++  if(NOT Thrift_VERSION)
++    if(DEFINED thrift_PC_VERSION AND thrift_PC_VERSION)
++      set(Thrift_VERSION "${thrift_PC_VERSION}")
++    elseif(DEFINED ThriftAlt_VERSION AND ThriftAlt_VERSION)
++      set(Thrift_VERSION "${ThriftAlt_VERSION}")
++    elseif(DEFINED THRIFT_VERSION AND THRIFT_VERSION)
++      set(Thrift_VERSION "${THRIFT_VERSION}")
++    endif()
++  endif()
++  if(NOT Thrift_VERSION)
++    message(FATAL_ERROR "Thrift_VERSION is empty after resolving Thrift 
dependency")
++  endif()
++
++  string(REPLACE "." ";" Thrift_VERSION_LIST "${Thrift_VERSION}")
+   list(GET Thrift_VERSION_LIST 0 Thrift_VERSION_MAJOR)
+   list(GET Thrift_VERSION_LIST 1 Thrift_VERSION_MINOR)
+   list(GET Thrift_VERSION_LIST 2 Thrift_VERSION_PATCH)
diff --git a/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch b/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
new file mode 100644
index 00000000000..31af1db7f0f
--- /dev/null
+++ b/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
@@ -0,0 +1,331 @@
+diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake
+index 74654a4..4065297 100644
+--- a/cmake_modules/BuildUtils.cmake
++++ b/cmake_modules/BuildUtils.cmake
+@@ -55,12 +55,18 @@ function(add_paimon_lib LIB_NAME)
+     # Necessary to make static linking into other shared libraries work 
properly
+     set_property(TARGET ${LIB_NAME}_objlib PROPERTY POSITION_INDEPENDENT_CODE 
1)
+     if(ARG_DEPENDENCIES)
+-        # Avoid add_dependencies on non-existent targets (e.g. when building 
static only).
++        # In static-only builds, some dependency names are still declared as
++        # *_shared. Map them to *_static when the shared target is 
unavailable.
+         set(_paimon_objlib_deps)
+         foreach(_paimon_dep IN LISTS ARG_DEPENDENCIES)
+-            if(TARGET ${_paimon_dep})
+-                list(APPEND _paimon_objlib_deps ${_paimon_dep})
++            set(_paimon_mapped_dep "${_paimon_dep}")
++            if(NOT TARGET ${_paimon_mapped_dep} AND _paimon_dep MATCHES 
"_shared$")
++                string(REGEX REPLACE "_shared$" "_static" _paimon_mapped_dep 
"${_paimon_dep}")
+             endif()
++            if(TARGET ${_paimon_mapped_dep})
++                list(APPEND _paimon_objlib_deps ${_paimon_mapped_dep})
++            endif()
++            unset(_paimon_mapped_dep)
+         endforeach()
+         if(_paimon_objlib_deps)
+             add_dependencies(${LIB_NAME}_objlib ${_paimon_objlib_deps})
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -909,9 +909,6 @@ macro(build_orc)
+         "-DCMAKE_CXX_FLAGS=${ORC_CMAKE_CXX_FLAGS}"
+         "-DCMAKE_C_FLAGS=${ORC_CMAKE_C_FLAGS}"
+         "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${ORC_CMAKE_CXX_FLAGS}"
+-        "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath=${ORC_RPATH}"
+-        "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath=${ORC_RPATH}"
+-        "-DCMAKE_MODULE_LINKER_FLAGS=-Wl,-rpath=${ORC_RPATH}"
+         "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}"
+         "-DLZ4_HOME=${ORC_LZ4_ROOT}"
+         "-DZSTD_HOME=${ORC_ZSTD_ROOT}"
+@@ -923,6 +920,13 @@ macro(build_orc)
+         -DBUILD_TOOLS=OFF
+         -DBUILD_CPP_ENABLE_METRICS=ON)
+
++    if(ORC_RPATH)
++        list(APPEND ORC_CMAKE_ARGS
++             "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath,${ORC_RPATH}"
++             "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,-rpath,${ORC_RPATH}"
++             "-DCMAKE_MODULE_LINKER_FLAGS=-Wl,-rpath,${ORC_RPATH}")
++    endif()
++
+     set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/orc.diff")
+     externalproject_add(orc_ep
+                         URL ${ORC_SOURCE_URL}
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1024,6 +1024,14 @@ macro(build_arrow)
+         "-DCMAKE_C_FLAGS=${ARROW_CMAKE_C_FLAGS}"
+         "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${ARROW_CMAKE_CXX_FLAGS}"
+         -DARROW_DEPENDENCY_USE_SHARED=OFF
++        # Avoid forcing CONDA dependency mode when CONDA_PREFIX is present.
++        # AUTO keeps the normal "find system first, fallback to bundled"
++        # behavior and prevents accidental pickup of conda's thrift/zstd.
++        -DARROW_DEPENDENCY_SOURCE=AUTO
++        # Isolate from user/system CMake package registries to improve
++        # reproducibility in CI and local mixed environments.
++        -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF
++        -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF
+         -DARROW_BUILD_SHARED=OFF
+         -DARROW_BUILD_STATIC=ON
+         -DARROW_BUILD_TESTS=OFF
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -761,6 +761,9 @@ macro(build_protobuf)
+     get_target_property(THIRDPARTY_ZLIB_INCLUDE_DIR zlib 
INTERFACE_INCLUDE_DIRECTORIES)
+     get_filename_component(THIRDPARTY_ZLIB_ROOT 
"${THIRDPARTY_ZLIB_INCLUDE_DIR}"
+                            DIRECTORY)
++    set(THIRDPARTY_ZLIB_STATIC_LIB
++        
"${THIRDPARTY_ZLIB_ROOT}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}z${CMAKE_STATIC_LIBRARY_SUFFIX}"
++    )
+
+     # Strip lto flags (which may be added by dh_auto_configure)
+     # See https://github.com/protocolbuffers/protobuf/issues/7092
+@@ -778,6 +781,10 @@ macro(build_protobuf)
+         "-DCMAKE_CXX_FLAGS=${PROTOBUF_CXX_FLAGS}"
+         "-DCMAKE_C_FLAGS=${PROTOBUF_C_FLAGS}"
+         "-DZLIB_ROOT=${THIRDPARTY_ZLIB_ROOT}"
++        "-DZLIB_INCLUDE_DIR=${THIRDPARTY_ZLIB_INCLUDE_DIR}"
++        "-DZLIB_LIBRARY=${THIRDPARTY_ZLIB_STATIC_LIB}"
++        "-DZLIB_LIBRARY_RELEASE=${THIRDPARTY_ZLIB_STATIC_LIB}"
++        "-DZLIB_LIBRARY_DEBUG=${THIRDPARTY_ZLIB_STATIC_LIB}"
+         -Dprotobuf_BUILD_TESTS=OFF
+         -Dprotobuf_DEBUG_POSTFIX=)
+     set(PROTOBUF_CONFIGURE SOURCE_SUBDIR "cmake" CMAKE_ARGS 
${PROTOBUF_CMAKE_ARGS})
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -34,6 +34,16 @@ set(EP_COMMON_TOOLCHAIN 
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
+                         "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
+
++option(PAIMON_USE_EXTERNAL_ARROW "Reuse external Arrow/Parquet instead of 
building arrow_ep" OFF)
++set(PAIMON_EXTERNAL_ARROW_INCLUDE_DIR "" CACHE PATH
++    "Include directory for external Arrow/Parquet headers")
++set(PAIMON_EXTERNAL_ARROW_LIB "" CACHE FILEPATH "Path to external libarrow.a")
++set(PAIMON_EXTERNAL_ARROW_DATASET_LIB "" CACHE FILEPATH "Path to external 
libarrow_dataset.a")
++set(PAIMON_EXTERNAL_ARROW_ACERO_LIB "" CACHE FILEPATH "Path to external 
libarrow_acero.a")
++set(PAIMON_EXTERNAL_PARQUET_LIB "" CACHE FILEPATH "Path to external 
libparquet.a")
++set(PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB "" CACHE FILEPATH
++    "Path to external libarrow_bundled_dependencies.a")
++
+ macro(set_urls URLS)
+     set(${URLS} ${ARGN})
+ endmacro()
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -961,5 +961,95 @@ macro(build_orc)
+ endmacro()
+
+ macro(build_arrow)
+-    message(STATUS "Building Arrow from source")
++    if(PAIMON_USE_EXTERNAL_ARROW)
++        set(ARROW_INCLUDE_DIR 
"${CMAKE_CURRENT_BINARY_DIR}/doris_external_arrow_include")
++        file(MAKE_DIRECTORY "${ARROW_INCLUDE_DIR}")
++        if(NOT EXISTS "${ARROW_INCLUDE_DIR}/arrow")
++            execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink
++                            "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/arrow"
++                            "${ARROW_INCLUDE_DIR}/arrow")
++        endif()
++        if(EXISTS "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/parquet"
++           AND NOT EXISTS "${ARROW_INCLUDE_DIR}/parquet")
++            execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink
++                            "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/parquet"
++                            "${ARROW_INCLUDE_DIR}/parquet")
++        endif()
++
++        if(NOT PAIMON_EXTERNAL_ARROW_INCLUDE_DIR)
++            message(FATAL_ERROR
++                    "PAIMON_EXTERNAL_ARROW_INCLUDE_DIR must be set when 
PAIMON_USE_EXTERNAL_ARROW=ON"
++            )
++        endif()
++        if(NOT EXISTS "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}")
++            message(FATAL_ERROR
++                    "PAIMON_EXTERNAL_ARROW_INCLUDE_DIR not found: 
${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}"
++            )
++        endif()
++
++        foreach(_paimon_external_lib
++                IN ITEMS PAIMON_EXTERNAL_ARROW_LIB
++                         PAIMON_EXTERNAL_ARROW_DATASET_LIB
++                         PAIMON_EXTERNAL_ARROW_ACERO_LIB
++                         PAIMON_EXTERNAL_PARQUET_LIB
++                         PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB)
++            if(NOT ${_paimon_external_lib})
++                message(FATAL_ERROR
++                        "${_paimon_external_lib} must be set when 
PAIMON_USE_EXTERNAL_ARROW=ON")
++            endif()
++            if(NOT EXISTS "${${_paimon_external_lib}}")
++                message(FATAL_ERROR
++                        "${_paimon_external_lib} not found: 
${${_paimon_external_lib}}")
++            endif()
++        endforeach()
++
++        add_library(arrow STATIC IMPORTED)
++        set_target_properties(arrow
++                              PROPERTIES IMPORTED_LOCATION 
"${PAIMON_EXTERNAL_ARROW_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        add_library(arrow_dataset STATIC IMPORTED)
++        set_target_properties(arrow_dataset
++                              PROPERTIES IMPORTED_LOCATION
++                                         
"${PAIMON_EXTERNAL_ARROW_DATASET_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        add_library(arrow_acero STATIC IMPORTED)
++        set_target_properties(arrow_acero
++                              PROPERTIES IMPORTED_LOCATION
++                                         "${PAIMON_EXTERNAL_ARROW_ACERO_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        add_library(parquet STATIC IMPORTED)
++        set_target_properties(parquet
++                              PROPERTIES IMPORTED_LOCATION 
"${PAIMON_EXTERNAL_PARQUET_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        add_library(arrow_bundled_dependencies STATIC IMPORTED)
++        set_target_properties(arrow_bundled_dependencies
++                              PROPERTIES IMPORTED_LOCATION
++                                         
"${PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        target_link_libraries(arrow_acero INTERFACE arrow)
++
++        target_link_libraries(arrow_dataset INTERFACE arrow_acero)
++
++        target_link_libraries(arrow
++                              INTERFACE zstd
++                                        snappy
++                                        lz4
++                                        zlib
++                                        arrow_bundled_dependencies)
++
++        target_link_libraries(parquet
++                              INTERFACE zstd snappy lz4 zlib 
arrow_bundled_dependencies
++                                        arrow_dataset)
++    else()
++        message(STATUS "Building Arrow from source")
+
+     get_target_property(ARROW_SNAPPY_INCLUDE_DIR snappy 
INTERFACE_INCLUDE_DIRECTORIES)
+     get_filename_component(ARROW_SNAPPY_ROOT "${ARROW_SNAPPY_INCLUDE_DIR}" 
DIRECTORY)
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1121,6 +1121,7 @@ macro(build_arrow)
+                                     zlib
+                                     arrow_bundled_dependencies
+                                     arrow_dataset)
++    endif()
+ 
+ endmacro(build_arrow)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -326,10 +326,10 @@ if(PAIMON_ENABLE_LUMINA)
+     include_directories("${CMAKE_SOURCE_DIR}/third_party/lumina/include")
+ endif()
+ 
++include_directories(SYSTEM ${GLOG_INCLUDE_DIR})
+ include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
+ include_directories(SYSTEM ${TBB_INCLUDE_DIR})
+ 
+-include_directories(SYSTEM ${GLOG_INCLUDE_DIR})
+ add_compile_definitions("GLOG_USE_GLOG_EXPORT")
+ 
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+
+diff --git a/src/paimon/common/logging/logging.cpp 
b/src/paimon/common/logging/logging.cpp
+--- a/src/paimon/common/logging/logging.cpp
++++ b/src/paimon/common/logging/logging.cpp
+@@ -83,7 +83,7 @@ std::unique_ptr<Logger> Logger::GetLogger(const std::string& 
path) {
+     }
+     std::unique_lock<std::shared_mutex> ulock(getRegistryLock());
+     if (!google::IsGoogleLoggingInitialized()) {
+-        google::InitGoogleLogging(program_invocation_name);
++        google::InitGoogleLogging("paimon-cpp");
+     }
+     return std::make_unique<GlogAdaptor>();
+ }
+
+diff --git a/src/paimon/common/memory/memory_pool.cpp 
b/src/paimon/common/memory/memory_pool.cpp
+--- a/src/paimon/common/memory/memory_pool.cpp
++++ b/src/paimon/common/memory/memory_pool.cpp
+@@ -55,7 +55,7 @@ void* MemoryPoolImpl::Malloc(uint64_t size, uint64_t 
alignment) {
+     return memptr;
+ }
+
+-void* MemoryPoolImpl::Realloc(void* p, size_t old_size, size_t new_size, 
size_t alignment) {
++void* MemoryPoolImpl::Realloc(void* p, size_t old_size, size_t new_size, 
uint64_t alignment) {
+     if (alignment == 0) {
+         void* memptr = ::realloc(p, new_size);
+         total_allocated_size.fetch_add(new_size - old_size);
+
+diff --git a/src/paimon/format/blob/blob_format_writer.cpp 
b/src/paimon/format/blob/blob_format_writer.cpp
+--- a/src/paimon/format/blob/blob_format_writer.cpp
++++ b/src/paimon/format/blob/blob_format_writer.cpp
+@@ -138,7 +138,8 @@ Status BlobFormatWriter::WriteBlob(std::string_view 
blob_data) {
+     }
+     PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, in->Length());
+     uint64_t total_read_length = 0;
+-    uint32_t read_len = std::min(file_length, tmp_buffer_->size());
++    uint32_t read_len =
++        static_cast<uint32_t>(std::min<uint64_t>(file_length, 
tmp_buffer_->size()));
+     while (read_len > 0) {
+         PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_len, 
in->Read(tmp_buffer_->data(), read_len));
+         if (static_cast<uint32_t>(actual_read_len) != read_len) {
+@@ -149,7 +150,8 @@ Status BlobFormatWriter::WriteBlob(std::string_view 
blob_data) {
+         }
+         PAIMON_RETURN_NOT_OK(WriteWithCrc32(tmp_buffer_->data(), 
actual_read_len));
+         total_read_length += actual_read_len;
+-        read_len = std::min(file_length - total_read_length, 
tmp_buffer_->size());
++        read_len = static_cast<uint32_t>(
++            std::min<uint64_t>(file_length - total_read_length, 
tmp_buffer_->size()));
+     }
+
+     // write bin length
+
+--- a/cmake_modules/arrow.diff
++++ b/cmake_modules/arrow.diff
+@@ -196,3 +196,29 @@
+    int64_t pagesize_;
+    ParquetDataPageVersion parquet_data_page_version_;
+    ParquetVersion::type parquet_version_;
++diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
++index 9df922afa2..5c8b3d4d07 100644
++--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
++@@ -1789,7 +1789,20 @@ if(ARROW_WITH_THRIFT)
++                     REQUIRED_VERSION
++                     0.11.0)
++
++-  string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
+++  if(NOT Thrift_VERSION)
+++    if(DEFINED thrift_PC_VERSION AND thrift_PC_VERSION)
+++      set(Thrift_VERSION "${thrift_PC_VERSION}")
+++    elseif(DEFINED ThriftAlt_VERSION AND ThriftAlt_VERSION)
+++      set(Thrift_VERSION "${ThriftAlt_VERSION}")
+++    elseif(DEFINED THRIFT_VERSION AND THRIFT_VERSION)
+++      set(Thrift_VERSION "${THRIFT_VERSION}")
+++    endif()
+++  endif()
+++  if(NOT Thrift_VERSION)
+++    message(FATAL_ERROR "Thrift_VERSION is empty after resolving Thrift 
dependency")
+++  endif()
+++
+++  string(REPLACE "." ";" Thrift_VERSION_LIST "${Thrift_VERSION}")
++   list(GET Thrift_VERSION_LIST 0 Thrift_VERSION_MAJOR)
++   list(GET Thrift_VERSION_LIST 1 Thrift_VERSION_MINOR)
++   list(GET Thrift_VERSION_LIST 2 Thrift_VERSION_PATCH)
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index 23eea97a349..b61978478c5 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -140,10 +140,11 @@ LZO2_SOURCE=lzo-2.10
 LZO2_MD5SUM="39d3f3f9c55c87b1e5d6888e1420f4b5"
 
 # rapidjson
-RAPIDJSON_DOWNLOAD="https://github.com/Tencent/rapidjson/archive/1a803826f1197b5e30703afe4b9c0e7dd48074f5.zip";
-RAPIDJSON_NAME=rapidjson-1a803826f1197b5e30703afe4b9c0e7dd48074f5.zip
-RAPIDJSON_SOURCE=rapidjson-1a803826f1197b5e30703afe4b9c0e7dd48074f5
-RAPIDJSON_MD5SUM="f2212a77e055a15501477f1e390007ea"
+# Updated to match paimon-cpp version (commit 232389d4f1012dddec4ef84861face2d2ba85709)
+RAPIDJSON_DOWNLOAD="https://github.com/miloyip/rapidjson/archive/232389d4f1012dddec4ef84861face2d2ba85709.tar.gz";
+RAPIDJSON_NAME=rapidjson-232389d4f1012dddec4ef84861face2d2ba85709.tar.gz
+RAPIDJSON_SOURCE=rapidjson-232389d4f1012dddec4ef84861face2d2ba85709
+RAPIDJSON_MD5SUM="577d3495a07b66fcd4a2866c93831bc4"
 
 # curl
 CURL_DOWNLOAD="https://curl.se/download/curl-8.2.1.tar.gz";
@@ -552,6 +553,14 @@ PUGIXML_NAME=pugixml-1.15.tar.gz
 PUGIXML_SOURCE=pugixml-1.15
 PUGIXML_MD5SUM="3b894c29455eb33a40b165c6e2de5895"
 
+# paimon-cpp
+# Using git clone since there's no official release yet
+# We'll use a specific commit or tag for reproducibility
+PAIMON_CPP_GIT_URL="https://github.com/alibaba/paimon-cpp.git";
+PAIMON_CPP_GIT_TAG="0a4f4e2e7967fdb0be180711bbe581a18eeeb2dd"
+PAIMON_CPP_NAME=paimon-cpp
+PAIMON_CPP_SOURCE=paimon-cpp
+
 # all thirdparties which need to be downloaded is set in array TP_ARCHIVES
 export TP_ARCHIVES=(
     'LIBEVENT'
@@ -634,6 +643,7 @@ export TP_ARCHIVES=(
     'ICU'
     'JINDOFS'
     'PUGIXML'
+    'PAIMON_CPP'
 )
 
 if [[ "$(uname -s)" == 'Darwin' ]]; then


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to