Copilot commented on code in PR #376:
URL: https://github.com/apache/fluss-rust/pull/376#discussion_r2852976166
##########
bindings/cpp/CMakeLists.txt:
##########
@@ -47,10 +55,113 @@ if (FLUSS_DEV)
set(FLUSS_ENABLE_TESTING ON)
endif()
+if (NOT FLUSS_CPP_DEP_MODE STREQUAL "system" AND NOT FLUSS_CPP_DEP_MODE
STREQUAL "build")
+ message(FATAL_ERROR "Unsupported
FLUSS_CPP_DEP_MODE='${FLUSS_CPP_DEP_MODE}'. Expected 'system' or 'build'.")
+endif()
+
+find_program(FLUSS_PROTOC_EXECUTABLE NAMES protoc)
+if (NOT FLUSS_PROTOC_EXECUTABLE)
+ message(FATAL_ERROR "protoc not found. Install protoc or set it in PATH.
(Fluss baseline: ${FLUSS_CPP_PROTOBUF_VERSION})")
+endif()
+
+if (DEFINED ENV{CARGO} AND NOT "$ENV{CARGO}" STREQUAL "" AND EXISTS
"$ENV{CARGO}")
+ set(FLUSS_CARGO_EXECUTABLE "$ENV{CARGO}")
+else()
+ if (DEFINED ENV{CARGO} AND NOT "$ENV{CARGO}" STREQUAL "")
+ get_filename_component(_FLUSS_CARGO_HINT_DIR "$ENV{CARGO}" DIRECTORY)
+ endif()
+ find_program(FLUSS_CARGO_EXECUTABLE NAMES cargo HINTS
"${_FLUSS_CARGO_HINT_DIR}")
+endif()
+if (NOT FLUSS_CARGO_EXECUTABLE)
+ message(FATAL_ERROR "cargo not found. Install Rust toolchain or set
CARGO/PATH.")
+endif()
+
+execute_process(
+ COMMAND ${FLUSS_PROTOC_EXECUTABLE} --version
+ OUTPUT_VARIABLE FLUSS_PROTOC_VERSION_OUTPUT
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_QUIET
+)
+string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" FLUSS_PROTOC_VERSION
"${FLUSS_PROTOC_VERSION_OUTPUT}")
+set(FLUSS_PROTOC_VERSION_NORM "${FLUSS_PROTOC_VERSION}")
+set(FLUSS_CPP_PROTOBUF_VERSION_NORM "${FLUSS_CPP_PROTOBUF_VERSION}")
+string(REGEX REPLACE "^3\\." "" FLUSS_PROTOC_VERSION_NORM
"${FLUSS_PROTOC_VERSION_NORM}")
+string(REGEX REPLACE "^3\\." "" FLUSS_CPP_PROTOBUF_VERSION_NORM
"${FLUSS_CPP_PROTOBUF_VERSION_NORM}")
+if (FLUSS_PROTOC_VERSION AND
+ NOT FLUSS_PROTOC_VERSION VERSION_EQUAL FLUSS_CPP_PROTOBUF_VERSION AND
+ NOT FLUSS_PROTOC_VERSION_NORM VERSION_EQUAL
FLUSS_CPP_PROTOBUF_VERSION_NORM)
+ message(WARNING
+ "protoc version (${FLUSS_PROTOC_VERSION}) does not match Fluss
baseline "
+ "(${FLUSS_CPP_PROTOBUF_VERSION}). Build may still work, but this is
outside the tested baseline.")
+endif()
+
+message(STATUS "Fluss C++ dependency mode: ${FLUSS_CPP_DEP_MODE}")
+message(STATUS "Fluss C++ protoc executable: ${FLUSS_PROTOC_EXECUTABLE}
(${FLUSS_PROTOC_VERSION_OUTPUT})")
+message(STATUS "Fluss C++ cargo executable: ${FLUSS_CARGO_EXECUTABLE}")
+
+if (FLUSS_CPP_DEP_MODE STREQUAL "system")
+ if (FLUSS_CPP_ARROW_SYSTEM_ROOT)
+ list(APPEND CMAKE_PREFIX_PATH "${FLUSS_CPP_ARROW_SYSTEM_ROOT}")
+ set(Arrow_ROOT "${FLUSS_CPP_ARROW_SYSTEM_ROOT}")
+ endif()
+
+ find_package(Arrow REQUIRED)
+
+ if (DEFINED Arrow_VERSION AND Arrow_VERSION AND NOT Arrow_VERSION
VERSION_EQUAL FLUSS_CPP_ARROW_VERSION)
+ message(WARNING
+ "Arrow version (${Arrow_VERSION}) does not match Fluss baseline "
+ "(${FLUSS_CPP_ARROW_VERSION}). Build may still work, but this is
outside the tested baseline.")
+ endif()
+else()
+ # Build mode: provision Arrow C++ from source in-tree.
+ set(ARROW_BUILD_SHARED ON CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_STATIC OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_INTEGRATION OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_UTILITIES OFF CACHE BOOL "" FORCE)
+ set(ARROW_COMPUTE OFF CACHE BOOL "" FORCE)
+ set(ARROW_CSV OFF CACHE BOOL "" FORCE)
+ set(ARROW_DATASET OFF CACHE BOOL "" FORCE)
+ set(ARROW_FILESYSTEM OFF CACHE BOOL "" FORCE)
+ set(ARROW_JSON OFF CACHE BOOL "" FORCE)
+ set(ARROW_PARQUET OFF CACHE BOOL "" FORCE)
+ set(ARROW_IPC ON CACHE BOOL "" FORCE)
+ # Reduce third-party sub-build complexity in build mode.
+ set(ARROW_JEMALLOC OFF CACHE BOOL "" FORCE)
+ set(ARROW_MIMALLOC OFF CACHE BOOL "" FORCE)
+ set(ARROW_DEPENDENCY_SOURCE BUNDLED CACHE STRING "" FORCE)
+ set(ARROW_SIMD_LEVEL NONE CACHE STRING "" FORCE)
+ set(ARROW_RUNTIME_SIMD_LEVEL NONE CACHE STRING "" FORCE)
+
+ FetchContent_Declare(
+ apache_arrow_src
+ URL ${FLUSS_CPP_ARROW_SOURCE_URL}
+ SOURCE_SUBDIR cpp
+ )
+ FetchContent_MakeAvailable(apache_arrow_src)
+ set(FLUSS_CPP_ARROW_EXTRA_INCLUDE_DIRS
+ "${apache_arrow_src_SOURCE_DIR}/cpp/src"
+ "${apache_arrow_src_BINARY_DIR}/src")
+
+ if (TARGET arrow_shared AND NOT TARGET Arrow::arrow_shared)
+ add_library(Arrow::arrow_shared ALIAS arrow_shared)
+ endif()
+ if (NOT TARGET Arrow::arrow_shared)
+ message(FATAL_ERROR "Arrow build mode did not produce target
Arrow::arrow_shared (or arrow_shared).")
+ endif()
+endif()
+
# Get cargo target dir
-execute_process(COMMAND cargo locate-project --workspace --message-format plain
+execute_process(COMMAND ${FLUSS_CARGO_EXECUTABLE} locate-project --workspace
--message-format plain
OUTPUT_VARIABLE CARGO_TARGET_DIR
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
+if (NOT CARGO_TARGET_DIR)
+ message(FATAL_ERROR
+ "Failed to resolve Cargo workspace target dir via
'${FLUSS_CARGO_EXECUTABLE} locate-project'. "
+ "Check Rust toolchain installation and PATH/CARGO.")
+endif()
string(REGEX REPLACE "/Cargo.toml\n$" "/target" CARGO_TARGET_DIR
"${CARGO_TARGET_DIR}")
Review Comment:
`execute_process(... locate-project ...)` doesn’t strip trailing whitespace,
and the subsequent regex replacement depends on a `\n` suffix
(`/Cargo.toml\n$`). This is brittle (e.g., `\r\n` line endings or changes in
Cargo output can leave `CARGO_TARGET_DIR` pointing at the *manifest file*
instead of the target dir). Prefer `OUTPUT_STRIP_TRAILING_WHITESPACE` and then
replace `/Cargo.toml$` (or use `get_filename_component(... DIRECTORY)` and
append `/target`).
```suggestion
OUTPUT_VARIABLE CARGO_MANIFEST_PATH
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
if (NOT CARGO_MANIFEST_PATH)
message(FATAL_ERROR
"Failed to resolve Cargo workspace target dir via
'${FLUSS_CARGO_EXECUTABLE} locate-project'. "
"Check Rust toolchain installation and PATH/CARGO.")
endif()
get_filename_component(CARGO_WORKSPACE_DIR "${CARGO_MANIFEST_PATH}"
DIRECTORY)
set(CARGO_TARGET_DIR "${CARGO_WORKSPACE_DIR}/target")
```
##########
docs/cpp-bazel-usage.md:
##########
@@ -0,0 +1,276 @@
+# Fluss C++ Bazel Usage Guide (System / Build Modes)
+
+This guide is for:
+
+- C++ application teams consuming Fluss C++ bindings via Bazel
+- Maintainers evolving the Bazel integration
+
+For the CMake flow with the same `system` / `build` dependency modes, see
+`docs/cpp-cmake-usage.md`.
+
+Current simplification scope:
+
+- Keep only two dependency modes in the mainline guidance:
+ - `system`
+ - `build`
+- Defer strict internal-registry-only module flow from the mainline path
+
+## Scope
+
+- Dependency model: **root module mode**
+- Consumer dependency target: `@red-fluss-rust//:fluss_cpp`
+- Build systems covered by this document: **Bazel**
+- Dependency modes covered by this document: **system/build**
+
+Version baseline references currently used by examples:
+
+- `protobuf/protoc`: `3.25.5`
+- `arrow-cpp`: `19.0.1`
+
+## Common Consumer `BUILD.bazel`
+
+Both modes use the same dependency target:
+
+```starlark
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+cc_binary(
+ name = "fluss_reader",
+ srcs = ["reader.cc"],
+ deps = ["@red-fluss-rust//:fluss_cpp"],
+)
+```
+
+## Mode 1: `system` (Recommended in preinstalled environments)
+
+Use this mode when your environment already provides:
+
+- `protoc`
+- Arrow C++ (headers + shared libraries)
+
+### Consumer `MODULE.bazel` (pattern)
+
+```starlark
+module(name = "my_cpp_app")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")
+bazel_dep(name = "red-fluss-rust", version = "0.1.0")
+
+fluss_cpp = use_extension("@red-fluss-rust//bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+ mode = "system",
+ protobuf_version = "3.25.5",
+ arrow_cpp_version = "19.0.1",
+ # Adjust Arrow paths for your environment
+ system_arrow_prefix = "/usr",
+ system_arrow_include_dir = "include",
+ system_arrow_shared_library = "lib/x86_64-linux-gnu/libarrow.so",
+ system_arrow_runtime_glob = "lib/x86_64-linux-gnu/libarrow.so*",
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")
+```
+
+### Build and run (consumer workspace pattern)
+
+```bash
+PROTOC_BIN="$(command -v protoc)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ //:fluss_reader
+```
+
+### Runnable example
+
+- `bindings/cpp/examples/bazel-consumer/system`
+
+```bash
+cd bindings/cpp/examples/bazel-consumer/system
+PROTOC_BIN="$(command -v protoc)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ //:consumer_system
+```
+
+## Mode 2: `build` (No internal registry / no preinstalled Arrow)
+
+Use this mode when Arrow C++ is not preinstalled and you want Bazel to
+provision it from source.
+
+### Consumer `MODULE.bazel` (pattern)
+
+```starlark
+module(name = "my_cpp_app")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")
+bazel_dep(name = "red-fluss-rust", version = "0.1.0")
+
+fluss_cpp = use_extension("@red-fluss-rust//bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+ mode = "build",
+ protobuf_version = "3.25.5",
+ arrow_cpp_version = "19.0.1",
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")
+```
+
+Notes:
+
+- `build` mode in the core Bazel integration still uses `PROTOC` (env / PATH).
+- To auto-download a pinned `protoc` for `build` mode, use
+ `bindings/cpp/scripts/ensure_protoc.sh` and pass the result via
`--action_env=PROTOC=...`.
+- Some environments may require `ep_cmake_ar/ranlib/nm` overrides.
+
+### Build and run (consumer workspace pattern, with auto-downloaded `protoc`)
+
+```bash
+PROTOC_BIN="$(bash bindings/cpp/scripts/ensure_protoc.sh --print-path)"
+```
+
+```bash
+bazel run --action_env=PROTOC="$PROTOC_BIN" //:fluss_reader
+```
+
+If `cargo` is not on Bazel action `PATH`, also pass:
+
+```bash
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ //:fluss_reader
+```
+
+### Runnable example
+
+- `bindings/cpp/examples/bazel-consumer/build`
+
+```bash
+cd bindings/cpp/examples/bazel-consumer/build
+PROTOC_BIN="$(bash ../../../scripts/ensure_protoc.sh --print-path)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ //:consumer_build
+```
+
+## Local Development Override (Optional)
+
+For repository-local validation only:
+
+```starlark
+local_path_override(
+ module_name = "red-fluss-rust",
+ path = "/path/to/fluss-rust",
+)
+```
+
+Do not keep local overrides in long-lived branches.
+
+## Repository-local Validation (Direct Commands)
+
+These commands validate the repository examples directly.
+If your environment requires a proxy for Bazel external downloads, export it
+before running:
+
+```bash
+export BAZEL_PROXY_URL="${BAZEL_PROXY_URL:-http://10.7.4.2:3128}"
+export http_proxy="$BAZEL_PROXY_URL"
+export https_proxy="$BAZEL_PROXY_URL"
+export HTTP_PROXY="$http_proxy"
+export HTTPS_PROXY="$https_proxy"
+unset all_proxy ALL_PROXY
+```
+
+### Validate `build` example
+
+```bash
+cd bindings/cpp/examples/bazel-consumer/build
+PROTOC_BIN="$(bash ../../../scripts/ensure_protoc.sh --print-path)"
+CARGO_BIN="$(command -v cargo)"
+bazel --ignore_all_rc_files run \
+ --registry=https://bcr.bazel.build \
+ --lockfile_mode=off \
+ --repo_env=http_proxy="${http_proxy:-}" \
+ --repo_env=https_proxy="${https_proxy:-}" \
+ --repo_env=HTTP_PROXY="${HTTP_PROXY:-}" \
+ --repo_env=HTTPS_PROXY="${HTTPS_PROXY:-}" \
+ --action_env=http_proxy="${http_proxy:-}" \
+ --action_env=https_proxy="${https_proxy:-}" \
+ --action_env=HTTP_PROXY="${HTTP_PROXY:-}" \
+ --action_env=HTTPS_PROXY="${HTTPS_PROXY:-}" \
+ --action_env=all_proxy= \
+ --action_env=ALL_PROXY= \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ --strategy=CcCmakeMakeRule=local \
+ --strategy=BootstrapGNUMake=local \
+ --strategy=BootstrapPkgConfig=local \
+ //:consumer_build
+```
+
+### Validate `system` example (using a local Arrow prefix)
+
+The `system` example defaults to `/usr`. If your Arrow prefix is elsewhere
+(for example a locally built prefix), copy the example to a temp directory and
+patch `MODULE.bazel` before running:
+
+```bash
+tmp_dir="$(mktemp -d /tmp/fluss-bazel-system-doc.XXXXXX)"
+cp -a bindings/cpp/examples/bazel-consumer/system/. "$tmp_dir/"
+sed -i \
+ -e 's|path = "../../../../../"|path = "/home/admin/mh/fluss-r2/fluss-rust"|'
\
+ -e 's|system_arrow_prefix = "/usr"|system_arrow_prefix =
"/tmp/fluss-system-arrow-19.0.1"|' \
+ -e 's|system_arrow_shared_library =
"lib/x86_64-linux-gnu/libarrow.so"|system_arrow_shared_library =
"lib/libarrow.so"|' \
+ -e 's|system_arrow_runtime_glob =
"lib/x86_64-linux-gnu/libarrow.so\\*"|system_arrow_runtime_glob =
"lib/libarrow.so*"|' \
+ "$tmp_dir/MODULE.bazel"
Review Comment:
The troubleshooting example uses `sed -i` (GNU sed syntax). On macOS/BSD
sed, `-i` requires a backup suffix (e.g. `-i ''`), so the command as written
will fail for macOS users. Consider either using a portable approach (e.g.
`perl -pi -e ...`) or documenting the macOS variant alongside the Linux command.
```suggestion
perl -pi -e '
s|path = "../../../../../"|path = "/home/admin/mh/fluss-r2/fluss-rust"|;
s|system_arrow_prefix = "/usr"|system_arrow_prefix =
"/tmp/fluss-system-arrow-19.0.1"|;
s|system_arrow_shared_library =
"lib/x86_64-linux-gnu/libarrow.so"|system_arrow_shared_library =
"lib/libarrow.so"|;
s|system_arrow_runtime_glob =
"lib/x86_64-linux-gnu/libarrow.so\*"|system_arrow_runtime_glob =
"lib/libarrow.so\*"|;
' "$tmp_dir/MODULE.bazel"
```
##########
docs/cpp-bazel-usage.md:
##########
@@ -0,0 +1,276 @@
+# Fluss C++ Bazel Usage Guide (System / Build Modes)
+
+This guide is for:
+
+- C++ application teams consuming Fluss C++ bindings via Bazel
+- Maintainers evolving the Bazel integration
+
+For the CMake flow with the same `system` / `build` dependency modes, see
+`docs/cpp-cmake-usage.md`.
+
+Current simplification scope:
+
+- Keep only two dependency modes in the mainline guidance:
+ - `system`
+ - `build`
+- Defer strict internal-registry-only module flow from the mainline path
+
+## Scope
+
+- Dependency model: **root module mode**
+- Consumer dependency target: `@red-fluss-rust//:fluss_cpp`
+- Build systems covered by this document: **Bazel**
+- Dependency modes covered by this document: **system/build**
+
+Version baseline references currently used by examples:
+
+- `protobuf/protoc`: `3.25.5`
+- `arrow-cpp`: `19.0.1`
+
+## Common Consumer `BUILD.bazel`
+
+Both modes use the same dependency target:
+
+```starlark
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+cc_binary(
+ name = "fluss_reader",
+ srcs = ["reader.cc"],
+ deps = ["@red-fluss-rust//:fluss_cpp"],
+)
+```
+
+## Mode 1: `system` (Recommended in preinstalled environments)
+
+Use this mode when your environment already provides:
+
+- `protoc`
+- Arrow C++ (headers + shared libraries)
+
+### Consumer `MODULE.bazel` (pattern)
+
+```starlark
+module(name = "my_cpp_app")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")
+bazel_dep(name = "red-fluss-rust", version = "0.1.0")
+
+fluss_cpp = use_extension("@red-fluss-rust//bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+ mode = "system",
+ protobuf_version = "3.25.5",
+ arrow_cpp_version = "19.0.1",
+ # Adjust Arrow paths for your environment
+ system_arrow_prefix = "/usr",
+ system_arrow_include_dir = "include",
+ system_arrow_shared_library = "lib/x86_64-linux-gnu/libarrow.so",
+ system_arrow_runtime_glob = "lib/x86_64-linux-gnu/libarrow.so*",
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")
+```
+
+### Build and run (consumer workspace pattern)
+
+```bash
+PROTOC_BIN="$(command -v protoc)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ //:fluss_reader
+```
+
+### Runnable example
+
+- `bindings/cpp/examples/bazel-consumer/system`
+
+```bash
+cd bindings/cpp/examples/bazel-consumer/system
+PROTOC_BIN="$(command -v protoc)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ //:consumer_system
+```
+
+## Mode 2: `build` (No internal registry / no preinstalled Arrow)
+
+Use this mode when Arrow C++ is not preinstalled and you want Bazel to
+provision it from source.
+
+### Consumer `MODULE.bazel` (pattern)
+
+```starlark
+module(name = "my_cpp_app")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")
+bazel_dep(name = "red-fluss-rust", version = "0.1.0")
+
+fluss_cpp = use_extension("@red-fluss-rust//bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+ mode = "build",
+ protobuf_version = "3.25.5",
+ arrow_cpp_version = "19.0.1",
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")
+```
+
+Notes:
+
+- `build` mode in the core Bazel integration still uses `PROTOC` (env / PATH).
+- To auto-download a pinned `protoc` for `build` mode, use
+ `bindings/cpp/scripts/ensure_protoc.sh` and pass the result via
`--action_env=PROTOC=...`.
+- Some environments may require `ep_cmake_ar/ranlib/nm` overrides.
+
+### Build and run (consumer workspace pattern, with auto-downloaded `protoc`)
+
+```bash
+PROTOC_BIN="$(bash bindings/cpp/scripts/ensure_protoc.sh --print-path)"
+```
+
+```bash
+bazel run --action_env=PROTOC="$PROTOC_BIN" //:fluss_reader
+```
+
+If `cargo` is not on Bazel action `PATH`, also pass:
+
+```bash
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ //:fluss_reader
+```
+
+### Runnable example
+
+- `bindings/cpp/examples/bazel-consumer/build`
+
+```bash
+cd bindings/cpp/examples/bazel-consumer/build
+PROTOC_BIN="$(bash ../../../scripts/ensure_protoc.sh --print-path)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+ --action_env=PROTOC="$PROTOC_BIN" \
+ --action_env=CARGO="$CARGO_BIN" \
+ --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+ //:consumer_build
+```
+
+## Local Development Override (Optional)
+
+For repository-local validation only:
+
+```starlark
+local_path_override(
+ module_name = "red-fluss-rust",
+ path = "/path/to/fluss-rust",
+)
+```
+
+Do not keep local overrides in long-lived branches.
+
+## Repository-local Validation (Direct Commands)
+
+These commands validate the repository examples directly.
+If your environment requires a proxy for Bazel external downloads, export it
+before running:
+
+```bash
+export BAZEL_PROXY_URL="${BAZEL_PROXY_URL:-http://10.7.4.2:3128}"
+export http_proxy="$BAZEL_PROXY_URL"
+export https_proxy="$BAZEL_PROXY_URL"
+export HTTP_PROXY="$http_proxy"
+export HTTPS_PROXY="$https_proxy"
Review Comment:
The suggested default proxy URL (`http://10.7.4.2:3128`) is
environment-specific and may be confusing in upstream docs. Consider removing
the hard-coded default and either leaving `BAZEL_PROXY_URL` unset by default or
using a placeholder like `http://proxy.example.com:3128`.
##########
bindings/cpp/scripts/ensure_protoc.sh:
##########
@@ -0,0 +1,184 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+PROTOBUF_BASELINE_VERSION="${PROTOBUF_BASELINE_VERSION:-3.25.5}"
+PROTOC_INSTALL_ROOT="${PROTOC_INSTALL_ROOT:-/tmp/fluss-cpp-tools}"
+PROTOC_OS="${PROTOC_OS:-linux}"
+PROTOC_ARCH="${PROTOC_ARCH:-x86_64}"
+PROTOC_FORCE_INSTALL="${PROTOC_FORCE_INSTALL:-0}"
+PROTOC_PRINT_PATH_ONLY="${PROTOC_PRINT_PATH_ONLY:-0}"
+
+usage() {
+ cat <<'EOF'
+Usage: bindings/cpp/scripts/ensure_protoc.sh [--print-path]
+
+Ensures a protoc binary matching the configured protobuf baseline is available.
+Installs into a local cache directory (default: /tmp/fluss-cpp-tools) and
prints
+the protoc path on stdout.
+
+Env vars:
+ PROTOBUF_BASELINE_VERSION Baseline protobuf version (default: 3.25.5)
+ PROTOC_INSTALL_ROOT Local cache root (default: /tmp/fluss-cpp-tools)
+ PROTOC_OS protoc package OS (default: linux)
+ PROTOC_ARCH protoc package arch (default: x86_64)
+ PROTOC_FORCE_INSTALL 1 to force re-download
+ BAZEL_PROXY_URL Optional proxy (sets curl/wget proxy envs if
present)
+EOF
+}
+
+for arg in "$@"; do
+ case "$arg" in
+ --print-path)
+ PROTOC_PRINT_PATH_ONLY=1
+ ;;
+ -h|--help)
+ usage
+ exit 0
+ ;;
+ *)
+ echo "Unknown argument: $arg" >&2
+ usage >&2
+ exit 1
+ ;;
+ esac
+done
+
+setup_proxy_env() {
+ if [[ -n "${BAZEL_PROXY_URL:-}" ]]; then
+ export http_proxy="${http_proxy:-$BAZEL_PROXY_URL}"
+ export https_proxy="${https_proxy:-$BAZEL_PROXY_URL}"
+ export HTTP_PROXY="${HTTP_PROXY:-$http_proxy}"
+ export HTTPS_PROXY="${HTTPS_PROXY:-$https_proxy}"
+ fi
+}
+
+normalize_version_for_protoc_release() {
+ local v="$1"
+ # Protobuf release packaging switched from v3.x.y to vX.Y for newer versions.
+ # For our current agreed baseline (3.25.5), the protoc archive/tag is 25.5.
+ if [[ "$v" =~ ^3\.([0-9]+\.[0-9]+)$ ]]; then
+ local stripped="${BASH_REMATCH[1]}"
+ local major="${stripped%%.*}"
+ if [[ "$major" -ge 21 ]]; then
+ echo "$stripped"
+ return 0
+ fi
+ fi
+ echo "$v"
+}
Review Comment:
`normalize_version_for_protoc_release()` doesn’t handle baseline versions
like `3.25.5` correctly. With the current regex it will return `3.25.5`, so the
script will try to download
`.../releases/download/v3.25.5/protoc-3.25.5-...zip`, which doesn’t exist for
modern protobuf releases (they use tags like `v25.5` / archives like
`protoc-25.5-...`). Please update the normalization to map `3.<major>.<patch>`
to `<major>.<patch>` for the affected ranges (e.g. `3.25.5` -> `25.5`) and keep
`version_matches_baseline()` consistent with that mapping.
##########
bindings/cpp/bazel/cpp/BUILD.bazel:
##########
@@ -0,0 +1,10 @@
+package(default_visibility = ["//visibility:public"])
+
+# Stable indirection target for the Arrow C++ dependency. The implementation
+# repo name can change across modes (registry/build/system) without touching
+# bindings/cpp/BUILD.bazel.
+alias(
+ name = "arrow_cpp_dep",
+ actual = "@apache_arrow_cpp//:arrow_cpp",
+)
+
Review Comment:
This `arrow_cpp_dep` alias duplicates `bazel/cpp/BUILD.bazel` but doesn’t
appear to be referenced (the main build uses `//bazel/cpp:arrow_cpp_dep`).
Duplicating the alias in two locations can cause confusion and future drift;
consider deleting this copy if it’s not needed.
```suggestion
```
##########
bindings/cpp/CMakeLists.txt:
##########
@@ -47,10 +55,113 @@ if (FLUSS_DEV)
set(FLUSS_ENABLE_TESTING ON)
endif()
+if (NOT FLUSS_CPP_DEP_MODE STREQUAL "system" AND NOT FLUSS_CPP_DEP_MODE
STREQUAL "build")
+ message(FATAL_ERROR "Unsupported
FLUSS_CPP_DEP_MODE='${FLUSS_CPP_DEP_MODE}'. Expected 'system' or 'build'.")
+endif()
+
+find_program(FLUSS_PROTOC_EXECUTABLE NAMES protoc)
+if (NOT FLUSS_PROTOC_EXECUTABLE)
+ message(FATAL_ERROR "protoc not found. Install protoc or set it in PATH.
(Fluss baseline: ${FLUSS_CPP_PROTOBUF_VERSION})")
+endif()
+
+if (DEFINED ENV{CARGO} AND NOT "$ENV{CARGO}" STREQUAL "" AND EXISTS
"$ENV{CARGO}")
+ set(FLUSS_CARGO_EXECUTABLE "$ENV{CARGO}")
+else()
+ if (DEFINED ENV{CARGO} AND NOT "$ENV{CARGO}" STREQUAL "")
+ get_filename_component(_FLUSS_CARGO_HINT_DIR "$ENV{CARGO}" DIRECTORY)
+ endif()
+ find_program(FLUSS_CARGO_EXECUTABLE NAMES cargo HINTS
"${_FLUSS_CARGO_HINT_DIR}")
+endif()
+if (NOT FLUSS_CARGO_EXECUTABLE)
+ message(FATAL_ERROR "cargo not found. Install Rust toolchain or set
CARGO/PATH.")
+endif()
+
+execute_process(
+ COMMAND ${FLUSS_PROTOC_EXECUTABLE} --version
+ OUTPUT_VARIABLE FLUSS_PROTOC_VERSION_OUTPUT
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_QUIET
+)
+string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" FLUSS_PROTOC_VERSION
"${FLUSS_PROTOC_VERSION_OUTPUT}")
+set(FLUSS_PROTOC_VERSION_NORM "${FLUSS_PROTOC_VERSION}")
+set(FLUSS_CPP_PROTOBUF_VERSION_NORM "${FLUSS_CPP_PROTOBUF_VERSION}")
+string(REGEX REPLACE "^3\\." "" FLUSS_PROTOC_VERSION_NORM
"${FLUSS_PROTOC_VERSION_NORM}")
+string(REGEX REPLACE "^3\\." "" FLUSS_CPP_PROTOBUF_VERSION_NORM
"${FLUSS_CPP_PROTOBUF_VERSION_NORM}")
+if (FLUSS_PROTOC_VERSION AND
+ NOT FLUSS_PROTOC_VERSION VERSION_EQUAL FLUSS_CPP_PROTOBUF_VERSION AND
+ NOT FLUSS_PROTOC_VERSION_NORM VERSION_EQUAL
FLUSS_CPP_PROTOBUF_VERSION_NORM)
+ message(WARNING
+ "protoc version (${FLUSS_PROTOC_VERSION}) does not match Fluss
baseline "
+ "(${FLUSS_CPP_PROTOBUF_VERSION}). Build may still work, but this is
outside the tested baseline.")
+endif()
+
+message(STATUS "Fluss C++ dependency mode: ${FLUSS_CPP_DEP_MODE}")
+message(STATUS "Fluss C++ protoc executable: ${FLUSS_PROTOC_EXECUTABLE}
(${FLUSS_PROTOC_VERSION_OUTPUT})")
+message(STATUS "Fluss C++ cargo executable: ${FLUSS_CARGO_EXECUTABLE}")
+
+if (FLUSS_CPP_DEP_MODE STREQUAL "system")
+ if (FLUSS_CPP_ARROW_SYSTEM_ROOT)
+ list(APPEND CMAKE_PREFIX_PATH "${FLUSS_CPP_ARROW_SYSTEM_ROOT}")
+ set(Arrow_ROOT "${FLUSS_CPP_ARROW_SYSTEM_ROOT}")
+ endif()
+
+ find_package(Arrow REQUIRED)
+
+ if (DEFINED Arrow_VERSION AND Arrow_VERSION AND NOT Arrow_VERSION
VERSION_EQUAL FLUSS_CPP_ARROW_VERSION)
+ message(WARNING
+ "Arrow version (${Arrow_VERSION}) does not match Fluss baseline "
+ "(${FLUSS_CPP_ARROW_VERSION}). Build may still work, but this is
outside the tested baseline.")
+ endif()
+else()
+ # Build mode: provision Arrow C++ from source in-tree.
+ set(ARROW_BUILD_SHARED ON CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_STATIC OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_INTEGRATION OFF CACHE BOOL "" FORCE)
+ set(ARROW_BUILD_UTILITIES OFF CACHE BOOL "" FORCE)
+ set(ARROW_COMPUTE OFF CACHE BOOL "" FORCE)
+ set(ARROW_CSV OFF CACHE BOOL "" FORCE)
+ set(ARROW_DATASET OFF CACHE BOOL "" FORCE)
+ set(ARROW_FILESYSTEM OFF CACHE BOOL "" FORCE)
+ set(ARROW_JSON OFF CACHE BOOL "" FORCE)
+ set(ARROW_PARQUET OFF CACHE BOOL "" FORCE)
+ set(ARROW_IPC ON CACHE BOOL "" FORCE)
+ # Reduce third-party sub-build complexity in build mode.
+ set(ARROW_JEMALLOC OFF CACHE BOOL "" FORCE)
+ set(ARROW_MIMALLOC OFF CACHE BOOL "" FORCE)
+ set(ARROW_DEPENDENCY_SOURCE BUNDLED CACHE STRING "" FORCE)
+ set(ARROW_SIMD_LEVEL NONE CACHE STRING "" FORCE)
+ set(ARROW_RUNTIME_SIMD_LEVEL NONE CACHE STRING "" FORCE)
+
+ FetchContent_Declare(
+ apache_arrow_src
+ URL ${FLUSS_CPP_ARROW_SOURCE_URL}
+ SOURCE_SUBDIR cpp
+ )
+ FetchContent_MakeAvailable(apache_arrow_src)
Review Comment:
`FetchContent_Declare(apache_arrow_src ...)` pulls the Arrow C++ source from
`${FLUSS_CPP_ARROW_SOURCE_URL}` without specifying a hash or other integrity
verification. If the download is tampered with (e.g., compromised mirror, DNS
poisoning, or MITM), CMake will build and link against attacker-controlled
Arrow code in your C++ bindings. Add an integrity check (such as `URL_HASH`
with a pinned digest) for the Arrow archive to ensure the downloaded contents
are exactly the expected release.
##########
bindings/cpp/scripts/ensure_protoc.sh:
##########
@@ -0,0 +1,184 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+PROTOBUF_BASELINE_VERSION="${PROTOBUF_BASELINE_VERSION:-3.25.5}"
+PROTOC_INSTALL_ROOT="${PROTOC_INSTALL_ROOT:-/tmp/fluss-cpp-tools}"
Review Comment:
`PROTOC_INSTALL_ROOT` defaults to a shared `/tmp/fluss-cpp-tools` directory,
and the script trusts any existing `protoc` binary found there without
validating its origin or integrity. On multi-user systems another local user
can pre-populate that cache path with a malicious executable that will then be
used by your build, enabling code execution in the victim’s context. Use a
cache location under the invoking user’s home directory (or another
non-world-writable path) and/or validate ownership and contents before trusting
an existing binary.
##########
bazel/cpp/deps.bzl:
##########
@@ -0,0 +1,290 @@
+"""Bzlmod extension for fluss C++ SDK dependency provisioning."""
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+ name = "all_srcs",
+ srcs = glob(
+ ["**"],
+ exclude = [
+ "**/BUILD",
+ "**/BUILD.bazel",
+ ],
+ ),
+)
+
+cmake(
+ name = "arrow_cpp",
+ lib_source = ":all_srcs",
+ working_directory = "cpp",
+ generate_args = ["-GUnix Makefiles"],
+ cache_entries = {
+ "CMAKE_BUILD_TYPE": "Release",
+ "CMAKE_INSTALL_LIBDIR": "lib",
+ "CMAKE_POSITION_INDEPENDENT_CODE": "ON",
+ "ARROW_BUILD_SHARED": "ON",
+ "ARROW_BUILD_STATIC": "OFF",
+ "ARROW_BUILD_TESTS": "OFF",
+ "ARROW_BUILD_EXAMPLES": "OFF",
+ "ARROW_BUILD_BENCHMARKS": "OFF",
+ "ARROW_BUILD_INTEGRATION": "OFF",
+ "ARROW_BUILD_UTILITIES": "OFF",
+ "ARROW_COMPUTE": "OFF",
+ "ARROW_CSV": "OFF",
+ "ARROW_DATASET": "OFF",
+ "ARROW_FILESYSTEM": "OFF",
+ "ARROW_JSON": "OFF",
+ "ARROW_PARQUET": "OFF",
+ "ARROW_IPC": "ON",
+ "ARROW_DEPENDENCY_SOURCE": "BUNDLED",
+ # Temporary workarounds for older images / Bazel sandbox toolchain
detection.
+ "EP_CMAKE_RANLIB": "__EP_CMAKE_RANLIB__",
+ "EP_CMAKE_AR": "__EP_CMAKE_AR__",
+ "EP_CMAKE_NM": "__EP_CMAKE_NM__",
+ },
+ out_include_dir = "include",
+ out_lib_dir = "lib",
+ out_shared_libs = select({
+ "@platforms//os:macos": ["libarrow.dylib"],
+ "//conditions:default": [
+ "libarrow.so",
+ "libarrow.so.1900",
+ "libarrow.so.1900.1.0",
+ ],
+ }),
+)
+"""
+
+_ARROW_PATCH_CMDS = [
+ "sed -i 's|#define ARROW_CXX_COMPILER_FLAGS \"@CMAKE_CXX_FLAGS@\"|#define
ARROW_CXX_COMPILER_FLAGS \"\"|' cpp/src/arrow/util/config.h.cmake",
+]
+
+_SYSTEM_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_cc//cc:defs.bzl", "cc_import", "cc_library")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_import(
+ name = "arrow_shared_import",
+ shared_library = "__SYSTEM_ARROW_SHARED_LIBRARY__",
+)
+
+filegroup(
+ name = "arrow_runtime_libs",
+ srcs = [
+__SYSTEM_ARROW_RUNTIME_SRCS__
+ ],
+)
+
+cc_library(
+ name = "arrow_cpp",
+ hdrs = [
+__SYSTEM_ARROW_HDRS__
+ ],
+ includes = ["__SYSTEM_ARROW_INCLUDE_DIR__"],
+ data = [":arrow_runtime_libs"],
+ deps = [":arrow_shared_import"],
+)
+"""
+
+_ARROW_BUILD_VERSIONS = {
+ "19.0.1": {
+ "urls":
["https://github.com/apache/arrow/archive/refs/tags/apache-arrow-19.0.1.tar.gz"],
+ "strip_prefix": "arrow-apache-arrow-19.0.1",
+ },
+}
+
+_config_tag = tag_class(attrs = {
+ "mode": attr.string(default = "build"),
+ "arrow_cpp_version": attr.string(default = "19.0.1"),
+ "protobuf_version": attr.string(default = "3.25.5"),
+ "ep_cmake_ranlib": attr.string(default = "ranlib"),
+ "ep_cmake_ar": attr.string(default = "ar"),
+ "ep_cmake_nm": attr.string(default = "nm"),
+ "system_arrow_prefix": attr.string(default = "/usr"),
+ "system_arrow_include_dir": attr.string(default = "include"),
+ "system_arrow_shared_library": attr.string(default =
"lib/x86_64-linux-gnu/libarrow.so"),
+ "system_arrow_runtime_glob": attr.string(default =
"lib/x86_64-linux-gnu/libarrow.so*"),
+})
+
+def _render_arrow_build_file(tag):
+ return _ARROW_BUILD_FILE_TEMPLATE.replace(
+ "__EP_CMAKE_RANLIB__",
+ tag.ep_cmake_ranlib,
+ ).replace(
+ "__EP_CMAKE_AR__",
+ tag.ep_cmake_ar,
+ ).replace(
+ "__EP_CMAKE_NM__",
+ tag.ep_cmake_nm,
+ )
+
+def _render_system_arrow_build_file(tag, shared_library_override = None):
+ shared_library = shared_library_override if shared_library_override else
(tag.system_arrow_shared_library if hasattr(tag, "system_arrow_shared_library")
else tag.shared_library)
+ include_dir = tag.system_arrow_include_dir if hasattr(tag,
"system_arrow_include_dir") else tag.include_dir
+ return _SYSTEM_ARROW_BUILD_FILE_TEMPLATE.replace(
+ "__SYSTEM_ARROW_SHARED_LIBRARY__",
+ "sysroot/" + shared_library,
+ ).replace(
+ "__SYSTEM_ARROW_INCLUDE_DIR__",
+ "sysroot/" + include_dir,
+ )
+
+def _starlark_string_list(items):
+ if not items:
+ return ""
+ return "\n".join([' "%s",' % i for i in items])
+
+def _list_files(repo_ctx, base_dir, suffixes):
+ result = repo_ctx.execute([
+ "/usr/bin/find",
+ base_dir,
+ "-type",
+ "f",
+ ])
+ if result.return_code != 0:
+ fail("failed to enumerate files under %s: %s" % (base_dir,
result.stderr))
+ files = []
+ for line in result.stdout.splitlines():
+ for suffix in suffixes:
+ if line.endswith(suffix):
+ files.append(line)
+ break
+ return sorted(files)
+
+def _system_arrow_repo_impl(repo_ctx):
+ prefix = repo_ctx.attr.prefix.rstrip("/")
+ include_dir = repo_ctx.attr.include_dir
+ shared_library = repo_ctx.attr.shared_library
+ runtime_glob = repo_ctx.attr.runtime_glob
+
+ repo_ctx.execute(["/bin/mkdir", "-p", "sysroot"])
+ copy_res = repo_ctx.execute(["/bin/cp", "-a", prefix + "/.", "sysroot"])
+ if copy_res.return_code != 0:
+ fail("failed to copy system arrow prefix %s: %s" % (prefix,
copy_res.stderr))
+
+ header_root = prefix + "/" + include_dir
+ headers = _list_files(repo_ctx, header_root, [".h", ".hpp"])
+ header_srcs = []
+ for h in headers:
+ if not h.startswith(prefix + "/"):
+ fail("header path %s is outside prefix %s" % (h, prefix))
+ header_srcs.append("sysroot/" + h[len(prefix) + 1:])
+
+ runtime_dir = runtime_glob.rsplit("/", 1)[0]
+ runtime_prefix = runtime_glob.rsplit("/", 1)[1].replace("*", "")
+ runtime_files = _list_files(repo_ctx, prefix + "/" + runtime_dir, [""])
+ runtime_srcs = []
+ for f in runtime_files:
+ rel = f[len(prefix) + 1:] if f.startswith(prefix + "/") else None
+ if rel == None:
+ continue
+ if rel.startswith(runtime_dir + "/") and rel.rsplit("/",
1)[1].startswith(runtime_prefix):
+ runtime_srcs.append("sysroot/" + rel)
+ runtime_srcs = sorted(runtime_srcs)
+
+ # Prefer a versioned soname file as the imported shared library so Bazel
+ # runfiles contain the exact filename required by the runtime loader.
+ shared_import_rel = "sysroot/" + shared_library
+ shared_basename = shared_library.rsplit("/", 1)[1]
+ soname_candidates = []
+ for rel in runtime_srcs:
+ base = rel.rsplit("/", 1)[1]
+ if base == shared_basename:
+ continue
+ if base.startswith(shared_basename + "."):
+ soname_candidates.append(rel)
+ if soname_candidates:
+ # Prefer shortest suffix first (e.g. libarrow.so.1900 before
+ # libarrow.so.1900.1.0) to match ELF SONAME naming when available.
+ soname_candidates = sorted(soname_candidates, key = lambda s: (len(s),
s))
+ shared_import_rel = soname_candidates[0]
+
+ build_file = _render_system_arrow_build_file(repo_ctx.attr,
shared_library_override = shared_import_rel[len("sysroot/"):]).replace(
+ "__SYSTEM_ARROW_HDRS__",
+ _starlark_string_list(header_srcs),
+ ).replace(
+ "__SYSTEM_ARROW_RUNTIME_SRCS__",
+ _starlark_string_list(runtime_srcs),
+ )
+ repo_ctx.file("BUILD.bazel", build_file)
+
+_system_arrow_repository = repository_rule(
+ implementation = _system_arrow_repo_impl,
+ attrs = {
+ "prefix": attr.string(mandatory = True),
+ "include_dir": attr.string(mandatory = True),
+ "shared_library": attr.string(mandatory = True),
+ "runtime_glob": attr.string(mandatory = True),
+ },
+ local = True,
+)
+
+def _select_config(ctx):
+ selected = None
+ selected_owner = None
+ root_selected = None
+ for mod in ctx.modules:
+ for tag in mod.tags.config:
+ is_root = hasattr(mod, "is_root") and mod.is_root
+ if is_root:
+ if root_selected != None:
+ fail("cpp_sdk.config may only be declared once in the root
module")
+ root_selected = tag
+ continue
+ if selected == None:
+ selected = tag
+ selected_owner = mod.name
+ elif selected_owner != mod.name:
+ # Prefer root override. Dependency defaults are tolerated as
long
+ # as they come from a single module.
+ fail("multiple dependency defaults for cpp_sdk.config without
root override")
+ if root_selected != None:
+ return root_selected
+ return selected
+
+def _cpp_sdk_impl(ctx):
+ tag = _select_config(ctx)
+ if tag == None:
+ return
+
+ if tag.mode == "registry":
+ return
+
+ if tag.mode == "system":
+ _system_arrow_repository(
+ name = "apache_arrow_cpp",
+ prefix = tag.system_arrow_prefix,
+ include_dir = tag.system_arrow_include_dir,
+ shared_library = tag.system_arrow_shared_library,
+ runtime_glob = tag.system_arrow_runtime_glob,
+ )
+ return
+
+ if tag.mode != "build":
+ fail("unsupported cpp_sdk mode: %s" % tag.mode)
+
+ arrow_version = _ARROW_BUILD_VERSIONS.get(tag.arrow_cpp_version)
+ if arrow_version == None:
+ fail("unsupported arrow_cpp_version for build mode: %s" %
tag.arrow_cpp_version)
+
+ http_archive(
+ name = "apache_arrow_cpp",
+ urls = arrow_version["urls"],
+ strip_prefix = arrow_version["strip_prefix"],
+ # TODO: Pin sha256/integrity once release packaging is finalized.
+ patch_cmds = _ARROW_PATCH_CMDS,
+ build_file_content = _render_arrow_build_file(tag),
+ )
Review Comment:
The Arrow C++ source is fetched via `http_archive` from GitHub without a
pinned `sha256` or other integrity check. If the remote tarball or the network
path is compromised, Bazel will transparently build and link against
attacker-controlled code in `apache_arrow_cpp`. Add a `sha256` (or equivalent
integrity mechanism) for the Arrow archive to ensure the fetched contents match
the expected release.
##########
bazel/cpp/deps.bzl:
##########
@@ -0,0 +1,290 @@
+"""Bzlmod extension for fluss C++ SDK dependency provisioning."""
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+ name = "all_srcs",
+ srcs = glob(
+ ["**"],
+ exclude = [
+ "**/BUILD",
+ "**/BUILD.bazel",
+ ],
+ ),
+)
+
+cmake(
+ name = "arrow_cpp",
+ lib_source = ":all_srcs",
+ working_directory = "cpp",
+ generate_args = ["-GUnix Makefiles"],
+ cache_entries = {
+ "CMAKE_BUILD_TYPE": "Release",
+ "CMAKE_INSTALL_LIBDIR": "lib",
+ "CMAKE_POSITION_INDEPENDENT_CODE": "ON",
+ "ARROW_BUILD_SHARED": "ON",
+ "ARROW_BUILD_STATIC": "OFF",
+ "ARROW_BUILD_TESTS": "OFF",
+ "ARROW_BUILD_EXAMPLES": "OFF",
+ "ARROW_BUILD_BENCHMARKS": "OFF",
+ "ARROW_BUILD_INTEGRATION": "OFF",
+ "ARROW_BUILD_UTILITIES": "OFF",
+ "ARROW_COMPUTE": "OFF",
+ "ARROW_CSV": "OFF",
+ "ARROW_DATASET": "OFF",
+ "ARROW_FILESYSTEM": "OFF",
+ "ARROW_JSON": "OFF",
+ "ARROW_PARQUET": "OFF",
+ "ARROW_IPC": "ON",
+ "ARROW_DEPENDENCY_SOURCE": "BUNDLED",
+ # Temporary workarounds for older images / Bazel sandbox toolchain
detection.
+ "EP_CMAKE_RANLIB": "__EP_CMAKE_RANLIB__",
+ "EP_CMAKE_AR": "__EP_CMAKE_AR__",
+ "EP_CMAKE_NM": "__EP_CMAKE_NM__",
+ },
+ out_include_dir = "include",
+ out_lib_dir = "lib",
+ out_shared_libs = select({
+ "@platforms//os:macos": ["libarrow.dylib"],
+ "//conditions:default": [
+ "libarrow.so",
+ "libarrow.so.1900",
+ "libarrow.so.1900.1.0",
+ ],
+ }),
+)
+"""
+
+_ARROW_PATCH_CMDS = [
+ "sed -i 's|#define ARROW_CXX_COMPILER_FLAGS \"@CMAKE_CXX_FLAGS@\"|#define
ARROW_CXX_COMPILER_FLAGS \"\"|' cpp/src/arrow/util/config.h.cmake",
Review Comment:
`patch_cmds` uses `sed -i ...`, which is not portable across platforms (BSD
sed on macOS requires an argument for `-i`). Since the build file template
already has macOS handling, this will likely break `build` mode on macOS.
Consider replacing this with a portable patch mechanism (e.g., `patch` with an
inline diff, or a small Python/Starlark patch step) so the archive can be
fetched on both Linux and macOS.
```suggestion
"python3 -c \"from pathlib import Path; p =
Path('cpp/src/arrow/util/config.h.cmake'); s = p.read_text(); s =
s.replace('#define ARROW_CXX_COMPILER_FLAGS \\\"@CMAKE_CXX_FLAGS@\\\"',
'#define ARROW_CXX_COMPILER_FLAGS \\\"\\\"'); p.write_text(s)\"",
```
##########
bindings/cpp/scripts/ensure_protoc.sh:
##########
@@ -0,0 +1,184 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+PROTOBUF_BASELINE_VERSION="${PROTOBUF_BASELINE_VERSION:-3.25.5}"
+PROTOC_INSTALL_ROOT="${PROTOC_INSTALL_ROOT:-/tmp/fluss-cpp-tools}"
+PROTOC_OS="${PROTOC_OS:-linux}"
+PROTOC_ARCH="${PROTOC_ARCH:-x86_64}"
+PROTOC_FORCE_INSTALL="${PROTOC_FORCE_INSTALL:-0}"
+PROTOC_PRINT_PATH_ONLY="${PROTOC_PRINT_PATH_ONLY:-0}"
+
+usage() {
+ cat <<'EOF'
+Usage: bindings/cpp/scripts/ensure_protoc.sh [--print-path]
+
+Ensures a protoc binary matching the configured protobuf baseline is available.
+Installs into a local cache directory (default: /tmp/fluss-cpp-tools) and
prints
+the protoc path on stdout.
+
+Env vars:
+ PROTOBUF_BASELINE_VERSION Baseline protobuf version (default: 3.25.5)
+ PROTOC_INSTALL_ROOT Local cache root (default: /tmp/fluss-cpp-tools)
+ PROTOC_OS protoc package OS (default: linux)
+ PROTOC_ARCH protoc package arch (default: x86_64)
+ PROTOC_FORCE_INSTALL 1 to force re-download
+ BAZEL_PROXY_URL Optional proxy (sets curl/wget proxy envs if
present)
+EOF
+}
+
+for arg in "$@"; do
+ case "$arg" in
+ --print-path)
+ PROTOC_PRINT_PATH_ONLY=1
+ ;;
+ -h|--help)
+ usage
+ exit 0
+ ;;
+ *)
+ echo "Unknown argument: $arg" >&2
+ usage >&2
+ exit 1
+ ;;
+ esac
+done
+
+setup_proxy_env() {
+ if [[ -n "${BAZEL_PROXY_URL:-}" ]]; then
+ export http_proxy="${http_proxy:-$BAZEL_PROXY_URL}"
+ export https_proxy="${https_proxy:-$BAZEL_PROXY_URL}"
+ export HTTP_PROXY="${HTTP_PROXY:-$http_proxy}"
+ export HTTPS_PROXY="${HTTPS_PROXY:-$https_proxy}"
+ fi
+}
+
+normalize_version_for_protoc_release() {
+ local v="$1"
+ # Protobuf release packaging switched from v3.x.y to vX.Y for newer versions.
+ # For our current agreed baseline (3.25.5), the protoc archive/tag is 25.5.
+ if [[ "$v" =~ ^3\.([0-9]+\.[0-9]+)$ ]]; then
+ local stripped="${BASH_REMATCH[1]}"
+ local major="${stripped%%.*}"
+ if [[ "$major" -ge 21 ]]; then
+ echo "$stripped"
+ return 0
+ fi
+ fi
+ echo "$v"
+}
+
+version_matches_baseline() {
+ local actual="$1"
+ local baseline="$2"
+ local actual_norm baseline_norm
+ actual_norm="$(normalize_version_for_protoc_release "$actual")"
+ baseline_norm="$(normalize_version_for_protoc_release "$baseline")"
+ [[ "$actual" == "$baseline" || "$actual_norm" == "$baseline_norm" ]]
+}
+
+download_file() {
+ local url="$1"
+ local out="$2"
+
+ if command -v curl >/dev/null 2>&1; then
+ if [[ -n "${https_proxy:-}" || -n "${http_proxy:-}" ]]; then
+ curl -fLk "$url" -o "$out"
+ else
+ curl -fL "$url" -o "$out"
+ fi
+ return 0
+ fi
+
+ if command -v wget >/dev/null 2>&1; then
+ local wget_args=()
+ if [[ -n "${https_proxy:-}" || -n "${http_proxy:-}" ]]; then
+ wget_args+=(--no-check-certificate -e use_proxy=yes)
+ if [[ -n "${https_proxy:-}" ]]; then
+ wget_args+=(-e "https_proxy=${https_proxy}")
+ fi
+ if [[ -n "${http_proxy:-}" ]]; then
+ wget_args+=(-e "http_proxy=${http_proxy}")
+ fi
+ fi
+ wget "${wget_args[@]}" -O "$out" "$url"
+ return 0
+ fi
+
+ echo "ERROR: neither curl nor wget is available for downloading protoc." >&2
+ return 1
+}
+
+ensure_zip_tools() {
+ command -v unzip >/dev/null 2>&1 || {
+ echo "ERROR: unzip not found." >&2
+ exit 1
+ }
+}
+
+setup_proxy_env
+ensure_zip_tools
+
+if command -v protoc >/dev/null 2>&1; then
+ existing_out="$(protoc --version 2>/dev/null || true)"
+ if [[ "$existing_out" =~ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
+ existing_ver="${BASH_REMATCH[1]}"
+ if version_matches_baseline "$existing_ver" "$PROTOBUF_BASELINE_VERSION";
then
+ command -v protoc
+ exit 0
+ fi
+ fi
+fi
+
+PROTOC_RELEASE_VERSION="$(normalize_version_for_protoc_release
"$PROTOBUF_BASELINE_VERSION")"
+PROTOC_ARCHIVE="protoc-${PROTOC_RELEASE_VERSION}-${PROTOC_OS}-${PROTOC_ARCH}.zip"
+PROTOC_URL="https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_RELEASE_VERSION}/${PROTOC_ARCHIVE}"
+PROTOC_PREFIX="${PROTOC_INSTALL_ROOT}/protoc-${PROTOC_RELEASE_VERSION}-${PROTOC_OS}-${PROTOC_ARCH}"
Review Comment:
The `download_file` helper disables TLS certificate verification (`curl
-fLk` and `wget --no-check-certificate`) when proxy variables are set and also
downloads the `protoc` binary from GitHub without any checksum or signature
verification. This makes it possible for a network attacker (e.g., via a
malicious or compromised proxy or MITM) or a compromised distribution channel
to serve a tampered `protoc` archive that will be executed in your build
environment. Harden this by enforcing TLS certificate verification and
validating the downloaded archive against a pinned checksum (or signature)
before extracting and using the binary.
##########
bindings/cpp/scripts/ensure_protoc.sh:
##########
@@ -0,0 +1,184 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+PROTOBUF_BASELINE_VERSION="${PROTOBUF_BASELINE_VERSION:-3.25.5}"
+PROTOC_INSTALL_ROOT="${PROTOC_INSTALL_ROOT:-/tmp/fluss-cpp-tools}"
+PROTOC_OS="${PROTOC_OS:-linux}"
+PROTOC_ARCH="${PROTOC_ARCH:-x86_64}"
+PROTOC_FORCE_INSTALL="${PROTOC_FORCE_INSTALL:-0}"
+PROTOC_PRINT_PATH_ONLY="${PROTOC_PRINT_PATH_ONLY:-0}"
+
+usage() {
+ cat <<'EOF'
+Usage: bindings/cpp/scripts/ensure_protoc.sh [--print-path]
+
+Ensures a protoc binary matching the configured protobuf baseline is available.
+Installs into a local cache directory (default: /tmp/fluss-cpp-tools) and
prints
+the protoc path on stdout.
+
+Env vars:
+ PROTOBUF_BASELINE_VERSION Baseline protobuf version (default: 3.25.5)
+ PROTOC_INSTALL_ROOT Local cache root (default: /tmp/fluss-cpp-tools)
+ PROTOC_OS protoc package OS (default: linux)
+ PROTOC_ARCH protoc package arch (default: x86_64)
+ PROTOC_FORCE_INSTALL 1 to force re-download
+ BAZEL_PROXY_URL Optional proxy (sets curl/wget proxy envs if
present)
+EOF
+}
+
+for arg in "$@"; do
+ case "$arg" in
+ --print-path)
+ PROTOC_PRINT_PATH_ONLY=1
+ ;;
+ -h|--help)
+ usage
+ exit 0
+ ;;
+ *)
+ echo "Unknown argument: $arg" >&2
+ usage >&2
+ exit 1
+ ;;
+ esac
+done
+
+setup_proxy_env() {
+ if [[ -n "${BAZEL_PROXY_URL:-}" ]]; then
+ export http_proxy="${http_proxy:-$BAZEL_PROXY_URL}"
+ export https_proxy="${https_proxy:-$BAZEL_PROXY_URL}"
+ export HTTP_PROXY="${HTTP_PROXY:-$http_proxy}"
+ export HTTPS_PROXY="${HTTPS_PROXY:-$https_proxy}"
+ fi
+}
+
+normalize_version_for_protoc_release() {
+ local v="$1"
+ # Protobuf release packaging switched from v3.x.y to vX.Y for newer versions.
+ # For our current agreed baseline (3.25.5), the protoc archive/tag is 25.5.
+ if [[ "$v" =~ ^3\.([0-9]+\.[0-9]+)$ ]]; then
+ local stripped="${BASH_REMATCH[1]}"
+ local major="${stripped%%.*}"
+ if [[ "$major" -ge 21 ]]; then
+ echo "$stripped"
+ return 0
+ fi
+ fi
+ echo "$v"
+}
+
+version_matches_baseline() {
+ local actual="$1"
+ local baseline="$2"
+ local actual_norm baseline_norm
+ actual_norm="$(normalize_version_for_protoc_release "$actual")"
+ baseline_norm="$(normalize_version_for_protoc_release "$baseline")"
+ [[ "$actual" == "$baseline" || "$actual_norm" == "$baseline_norm" ]]
+}
+
+download_file() {
+ local url="$1"
+ local out="$2"
+
+ if command -v curl >/dev/null 2>&1; then
+ if [[ -n "${https_proxy:-}" || -n "${http_proxy:-}" ]]; then
+ curl -fLk "$url" -o "$out"
+ else
+ curl -fL "$url" -o "$out"
+ fi
+ return 0
+ fi
+
+ if command -v wget >/dev/null 2>&1; then
+ local wget_args=()
+ if [[ -n "${https_proxy:-}" || -n "${http_proxy:-}" ]]; then
+ wget_args+=(--no-check-certificate -e use_proxy=yes)
+ if [[ -n "${https_proxy:-}" ]]; then
+ wget_args+=(-e "https_proxy=${https_proxy}")
+ fi
+ if [[ -n "${http_proxy:-}" ]]; then
+ wget_args+=(-e "http_proxy=${http_proxy}")
+ fi
Review Comment:
Similar to the curl path, the wget proxy path disables TLS verification
(`--no-check-certificate`). This should not be enabled automatically just
because a proxy is configured; keep verification on by default and require an
explicit opt-in for insecure downloads.
##########
bindings/cpp/bazel/cpp/deps.bzl:
##########
@@ -0,0 +1,290 @@
+"""Bzlmod extension for fluss C++ SDK dependency provisioning."""
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+ name = "all_srcs",
+ srcs = glob(
+ ["**"],
+ exclude = [
+ "**/BUILD",
+ "**/BUILD.bazel",
+ ],
+ ),
+)
+
+cmake(
+ name = "arrow_cpp",
+ lib_source = ":all_srcs",
+ working_directory = "cpp",
+ generate_args = ["-GUnix Makefiles"],
+ cache_entries = {
+ "CMAKE_BUILD_TYPE": "Release",
+ "CMAKE_INSTALL_LIBDIR": "lib",
+ "CMAKE_POSITION_INDEPENDENT_CODE": "ON",
+ "ARROW_BUILD_SHARED": "ON",
+ "ARROW_BUILD_STATIC": "OFF",
+ "ARROW_BUILD_TESTS": "OFF",
+ "ARROW_BUILD_EXAMPLES": "OFF",
+ "ARROW_BUILD_BENCHMARKS": "OFF",
+ "ARROW_BUILD_INTEGRATION": "OFF",
+ "ARROW_BUILD_UTILITIES": "OFF",
+ "ARROW_COMPUTE": "OFF",
+ "ARROW_CSV": "OFF",
+ "ARROW_DATASET": "OFF",
+ "ARROW_FILESYSTEM": "OFF",
+ "ARROW_JSON": "OFF",
+ "ARROW_PARQUET": "OFF",
+ "ARROW_IPC": "ON",
+ "ARROW_DEPENDENCY_SOURCE": "BUNDLED",
+ # Temporary workarounds for older images / Bazel sandbox toolchain
detection.
+ "EP_CMAKE_RANLIB": "__EP_CMAKE_RANLIB__",
+ "EP_CMAKE_AR": "__EP_CMAKE_AR__",
+ "EP_CMAKE_NM": "__EP_CMAKE_NM__",
+ },
+ out_include_dir = "include",
+ out_lib_dir = "lib",
+ out_shared_libs = select({
+ "@platforms//os:macos": ["libarrow.dylib"],
+ "//conditions:default": [
+ "libarrow.so",
+ "libarrow.so.1900",
+ "libarrow.so.1900.1.0",
+ ],
+ }),
+)
+"""
+
+_ARROW_PATCH_CMDS = [
+ "sed -i 's|#define ARROW_CXX_COMPILER_FLAGS \"@CMAKE_CXX_FLAGS@\"|#define
ARROW_CXX_COMPILER_FLAGS \"\"|' cpp/src/arrow/util/config.h.cmake",
+]
+
+_SYSTEM_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_cc//cc:defs.bzl", "cc_import", "cc_library")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_import(
+ name = "arrow_shared_import",
+ shared_library = "__SYSTEM_ARROW_SHARED_LIBRARY__",
+)
+
+filegroup(
+ name = "arrow_runtime_libs",
+ srcs = [
+__SYSTEM_ARROW_RUNTIME_SRCS__
+ ],
+)
+
Review Comment:
This file appears to be a byte-for-byte duplicate of `bazel/cpp/deps.bzl`,
but the repository and examples reference `//bazel/cpp:deps.bzl` /
`@red-fluss-rust//bazel/cpp:deps.bzl` (not `bindings/cpp/bazel/cpp`). Keeping
two copies risks them drifting and makes it unclear which one is authoritative.
Consider removing this duplicate (and `bindings/cpp/bazel/cpp/BUILD.bazel`) or
turning it into a thin forwarding wrapper that loads from
`//bazel/cpp:deps.bzl`.
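```suggestion
"""Compatibility wrapper for the fluss C++ SDK bzlmod extension.

This file forwards to the canonical implementation in //bazel/cpp:deps.bzl
to avoid maintaining duplicate logic. Prefer loading from that location:

    load("//bazel/cpp:deps.bzl", "fluss_cpp_deps")
"""
load("//bazel/cpp:deps.bzl", _fluss_cpp_deps = "fluss_cpp_deps")

# Symbols brought in by load() are not re-exported from a .bzl file, so the
# wrapper must rebind the name explicitly for callers that load it from here.
fluss_cpp_deps = _fluss_cpp_deps
```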
##########
bindings/cpp/bazel/cpp/deps.bzl:
##########
@@ -0,0 +1,290 @@
+"""Bzlmod extension for fluss C++ SDK dependency provisioning."""
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+ name = "all_srcs",
+ srcs = glob(
+ ["**"],
+ exclude = [
+ "**/BUILD",
+ "**/BUILD.bazel",
+ ],
+ ),
+)
+
+cmake(
+ name = "arrow_cpp",
+ lib_source = ":all_srcs",
+ working_directory = "cpp",
+ generate_args = ["-GUnix Makefiles"],
+ cache_entries = {
+ "CMAKE_BUILD_TYPE": "Release",
+ "CMAKE_INSTALL_LIBDIR": "lib",
+ "CMAKE_POSITION_INDEPENDENT_CODE": "ON",
+ "ARROW_BUILD_SHARED": "ON",
+ "ARROW_BUILD_STATIC": "OFF",
+ "ARROW_BUILD_TESTS": "OFF",
+ "ARROW_BUILD_EXAMPLES": "OFF",
+ "ARROW_BUILD_BENCHMARKS": "OFF",
+ "ARROW_BUILD_INTEGRATION": "OFF",
+ "ARROW_BUILD_UTILITIES": "OFF",
+ "ARROW_COMPUTE": "OFF",
+ "ARROW_CSV": "OFF",
+ "ARROW_DATASET": "OFF",
+ "ARROW_FILESYSTEM": "OFF",
+ "ARROW_JSON": "OFF",
+ "ARROW_PARQUET": "OFF",
+ "ARROW_IPC": "ON",
+ "ARROW_DEPENDENCY_SOURCE": "BUNDLED",
+ # Temporary workarounds for older images / Bazel sandbox toolchain
detection.
+ "EP_CMAKE_RANLIB": "__EP_CMAKE_RANLIB__",
+ "EP_CMAKE_AR": "__EP_CMAKE_AR__",
+ "EP_CMAKE_NM": "__EP_CMAKE_NM__",
+ },
+ out_include_dir = "include",
+ out_lib_dir = "lib",
+ out_shared_libs = select({
+ "@platforms//os:macos": ["libarrow.dylib"],
+ "//conditions:default": [
+ "libarrow.so",
+ "libarrow.so.1900",
+ "libarrow.so.1900.1.0",
+ ],
+ }),
+)
+"""
+
+_ARROW_PATCH_CMDS = [
+ "sed -i 's|#define ARROW_CXX_COMPILER_FLAGS \"@CMAKE_CXX_FLAGS@\"|#define
ARROW_CXX_COMPILER_FLAGS \"\"|' cpp/src/arrow/util/config.h.cmake",
+]
+
+_SYSTEM_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_cc//cc:defs.bzl", "cc_import", "cc_library")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_import(
+ name = "arrow_shared_import",
+ shared_library = "__SYSTEM_ARROW_SHARED_LIBRARY__",
+)
+
+filegroup(
+ name = "arrow_runtime_libs",
+ srcs = [
+__SYSTEM_ARROW_RUNTIME_SRCS__
+ ],
+)
+
+cc_library(
+ name = "arrow_cpp",
+ hdrs = [
+__SYSTEM_ARROW_HDRS__
+ ],
+ includes = ["__SYSTEM_ARROW_INCLUDE_DIR__"],
+ data = [":arrow_runtime_libs"],
+ deps = [":arrow_shared_import"],
+)
+"""
+
+_ARROW_BUILD_VERSIONS = {
+ "19.0.1": {
+ "urls":
["https://github.com/apache/arrow/archive/refs/tags/apache-arrow-19.0.1.tar.gz"],
+ "strip_prefix": "arrow-apache-arrow-19.0.1",
+ },
+}
+
+_config_tag = tag_class(attrs = {
+ "mode": attr.string(default = "build"),
+ "arrow_cpp_version": attr.string(default = "19.0.1"),
+ "protobuf_version": attr.string(default = "3.25.5"),
+ "ep_cmake_ranlib": attr.string(default = "ranlib"),
+ "ep_cmake_ar": attr.string(default = "ar"),
+ "ep_cmake_nm": attr.string(default = "nm"),
+ "system_arrow_prefix": attr.string(default = "/usr"),
+ "system_arrow_include_dir": attr.string(default = "include"),
+ "system_arrow_shared_library": attr.string(default =
"lib/x86_64-linux-gnu/libarrow.so"),
+ "system_arrow_runtime_glob": attr.string(default =
"lib/x86_64-linux-gnu/libarrow.so*"),
+})
+
+def _render_arrow_build_file(tag):
+ return _ARROW_BUILD_FILE_TEMPLATE.replace(
+ "__EP_CMAKE_RANLIB__",
+ tag.ep_cmake_ranlib,
+ ).replace(
+ "__EP_CMAKE_AR__",
+ tag.ep_cmake_ar,
+ ).replace(
+ "__EP_CMAKE_NM__",
+ tag.ep_cmake_nm,
+ )
+
+def _render_system_arrow_build_file(tag, shared_library_override = None):
+ shared_library = shared_library_override if shared_library_override else
(tag.system_arrow_shared_library if hasattr(tag, "system_arrow_shared_library")
else tag.shared_library)
+ include_dir = tag.system_arrow_include_dir if hasattr(tag,
"system_arrow_include_dir") else tag.include_dir
+ return _SYSTEM_ARROW_BUILD_FILE_TEMPLATE.replace(
+ "__SYSTEM_ARROW_SHARED_LIBRARY__",
+ "sysroot/" + shared_library,
+ ).replace(
+ "__SYSTEM_ARROW_INCLUDE_DIR__",
+ "sysroot/" + include_dir,
+ )
+
+def _starlark_string_list(items):
+ if not items:
+ return ""
+ return "\n".join([' "%s",' % i for i in items])
+
+def _list_files(repo_ctx, base_dir, suffixes):
+ result = repo_ctx.execute([
+ "/usr/bin/find",
+ base_dir,
+ "-type",
+ "f",
+ ])
+ if result.return_code != 0:
+ fail("failed to enumerate files under %s: %s" % (base_dir,
result.stderr))
+ files = []
+ for line in result.stdout.splitlines():
+ for suffix in suffixes:
+ if line.endswith(suffix):
+ files.append(line)
+ break
+ return sorted(files)
+
+def _system_arrow_repo_impl(repo_ctx):
+ prefix = repo_ctx.attr.prefix.rstrip("/")
+ include_dir = repo_ctx.attr.include_dir
+ shared_library = repo_ctx.attr.shared_library
+ runtime_glob = repo_ctx.attr.runtime_glob
+
+ repo_ctx.execute(["/bin/mkdir", "-p", "sysroot"])
+ copy_res = repo_ctx.execute(["/bin/cp", "-a", prefix + "/.", "sysroot"])
+ if copy_res.return_code != 0:
+ fail("failed to copy system arrow prefix %s: %s" % (prefix,
copy_res.stderr))
+
+ header_root = prefix + "/" + include_dir
+ headers = _list_files(repo_ctx, header_root, [".h", ".hpp"])
+ header_srcs = []
+ for h in headers:
+ if not h.startswith(prefix + "/"):
+ fail("header path %s is outside prefix %s" % (h, prefix))
+ header_srcs.append("sysroot/" + h[len(prefix) + 1:])
+
+ runtime_dir = runtime_glob.rsplit("/", 1)[0]
+ runtime_prefix = runtime_glob.rsplit("/", 1)[1].replace("*", "")
+ runtime_files = _list_files(repo_ctx, prefix + "/" + runtime_dir, [""])
+ runtime_srcs = []
+ for f in runtime_files:
+ rel = f[len(prefix) + 1:] if f.startswith(prefix + "/") else None
+ if rel == None:
+ continue
+ if rel.startswith(runtime_dir + "/") and rel.rsplit("/",
1)[1].startswith(runtime_prefix):
+ runtime_srcs.append("sysroot/" + rel)
+ runtime_srcs = sorted(runtime_srcs)
+
+ # Prefer a versioned soname file as the imported shared library so Bazel
+ # runfiles contain the exact filename required by the runtime loader.
+ shared_import_rel = "sysroot/" + shared_library
+ shared_basename = shared_library.rsplit("/", 1)[1]
+ soname_candidates = []
+ for rel in runtime_srcs:
+ base = rel.rsplit("/", 1)[1]
+ if base == shared_basename:
+ continue
+ if base.startswith(shared_basename + "."):
+ soname_candidates.append(rel)
+ if soname_candidates:
+ # Prefer shortest suffix first (e.g. libarrow.so.1900 before
+ # libarrow.so.1900.1.0) to match ELF SONAME naming when available.
+ soname_candidates = sorted(soname_candidates, key = lambda s: (len(s),
s))
+ shared_import_rel = soname_candidates[0]
+
+ build_file = _render_system_arrow_build_file(repo_ctx.attr,
shared_library_override = shared_import_rel[len("sysroot/"):]).replace(
+ "__SYSTEM_ARROW_HDRS__",
+ _starlark_string_list(header_srcs),
+ ).replace(
+ "__SYSTEM_ARROW_RUNTIME_SRCS__",
+ _starlark_string_list(runtime_srcs),
+ )
+ repo_ctx.file("BUILD.bazel", build_file)
+
+_system_arrow_repository = repository_rule(
+ implementation = _system_arrow_repo_impl,
+ attrs = {
+ "prefix": attr.string(mandatory = True),
+ "include_dir": attr.string(mandatory = True),
+ "shared_library": attr.string(mandatory = True),
+ "runtime_glob": attr.string(mandatory = True),
+ },
+ local = True,
+)
+
+def _select_config(ctx):
+ selected = None
+ selected_owner = None
+ root_selected = None
+ for mod in ctx.modules:
+ for tag in mod.tags.config:
+ is_root = hasattr(mod, "is_root") and mod.is_root
+ if is_root:
+ if root_selected != None:
+ fail("cpp_sdk.config may only be declared once in the root
module")
+ root_selected = tag
+ continue
+ if selected == None:
+ selected = tag
+ selected_owner = mod.name
+ elif selected_owner != mod.name:
+ # Prefer root override. Dependency defaults are tolerated as
long
+ # as they come from a single module.
+ fail("multiple dependency defaults for cpp_sdk.config without
root override")
+ if root_selected != None:
+ return root_selected
+ return selected
+
+def _cpp_sdk_impl(ctx):
+ tag = _select_config(ctx)
+ if tag == None:
+ return
+
+ if tag.mode == "registry":
+ return
+
+ if tag.mode == "system":
+ _system_arrow_repository(
+ name = "apache_arrow_cpp",
+ prefix = tag.system_arrow_prefix,
+ include_dir = tag.system_arrow_include_dir,
+ shared_library = tag.system_arrow_shared_library,
+ runtime_glob = tag.system_arrow_runtime_glob,
+ )
+ return
+
+ if tag.mode != "build":
+ fail("unsupported cpp_sdk mode: %s" % tag.mode)
+
+ arrow_version = _ARROW_BUILD_VERSIONS.get(tag.arrow_cpp_version)
+ if arrow_version == None:
+ fail("unsupported arrow_cpp_version for build mode: %s" %
tag.arrow_cpp_version)
+
+ http_archive(
+ name = "apache_arrow_cpp",
+ urls = arrow_version["urls"],
+ strip_prefix = arrow_version["strip_prefix"],
+ # TODO: Pin sha256/integrity once release packaging is finalized.
+ patch_cmds = _ARROW_PATCH_CMDS,
+ build_file_content = _render_arrow_build_file(tag),
+ )
Review Comment:
The Arrow C++ source is fetched via `http_archive` from GitHub without a
pinned `sha256` or other integrity check. If the remote tarball or the network
path is compromised, Bazel will transparently build and link against
attacker-controlled code in `apache_arrow_cpp`. Add a `sha256` (or equivalent
integrity mechanism) for the Arrow archive to ensure the fetched contents match
the expected release.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]