This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 65993cd4a5 GH-49434: [C++][CI] Add golden integration files to IPC
file fuzz corpus (#49440)
65993cd4a5 is described below
commit 65993cd4a5ca3d36c43be900a31eb98e12d7b9f5
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Mar 4 10:20:33 2026 +0100
GH-49434: [C++][CI] Add golden integration files to IPC file fuzz corpus
(#49440)
### Rationale for this change
The IPC stream fuzzer's seed corpus already includes the golden integration
files, but the IPC file fuzzer's seed corpus currently does not.
### What changes are included in this PR?
1. Add golden IPC integration files to the IPC file fuzzer seed corpus
2. Make minor cosmetic changes to the C++ test script so that its output is
slightly less bulky (no functional difference)
### Are these changes tested?
Yes, by existing CI tests.
### Are there any user-facing changes?
No.
* GitHub Issue: #49434
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
ci/scripts/cpp_test.sh | 32 ++++++++++++-------------
cpp/build-support/fuzzing/generate_corpuses.sh | 10 +++++---
cpp/src/parquet/arrow/fuzz_encoding_internal.cc | 3 ++-
3 files changed, 24 insertions(+), 21 deletions(-)
diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh
index 53a6b26151..921983fdb0 100755
--- a/ci/scripts/cpp_test.sh
+++ b/ci/scripts/cpp_test.sh
@@ -191,6 +191,7 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1"
# 1. Generate seed corpuses
+ # For IPC fuzz targets, these will include the golden IPC integration
files.
"${source_dir}/build-support/fuzzing/generate_corpuses.sh"
"${binary_output_dir}"
# 2. Run fuzz targets on seed corpus entries
@@ -198,9 +199,11 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
fuzz_target_basename=$1
corpus_dir=${binary_output_dir}/${fuzz_target_basename}_seed_corpus
mkdir -p "${corpus_dir}"
- rm -f "${corpus_dir}"/*
- unzip "${binary_output_dir}"/"${fuzz_target_basename}"_seed_corpus.zip
-d "${corpus_dir}"
- "${binary_output_dir}"/"${fuzz_target_basename}" -rss_limit_mb=4000
"${corpus_dir}"/*
+ pushd "${corpus_dir}"
+ unzip -q
"${binary_output_dir}"/"${fuzz_target_basename}"_seed_corpus.zip -d .
+ "${binary_output_dir}"/"${fuzz_target_basename}" -rss_limit_mb=4000 ./*
+ popd
+ rm -rf "${corpus_dir}"
}
run_fuzz_target_on_seed_corpus arrow-csv-fuzz
run_fuzz_target_on_seed_corpus arrow-ipc-file-fuzz
@@ -212,22 +215,17 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
fi
# 3. Run fuzz targets on regression files from arrow-testing
- # Run golden IPC integration files: these should ideally load without
errors,
- # though some very old ones carry invalid data (such as decimal values
- # larger than their advertised precision).
- # shellcheck disable=SC2046
- "${binary_output_dir}/arrow-ipc-stream-fuzz" $(find
"${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.stream")
- # shellcheck disable=SC2046
- "${binary_output_dir}/arrow-ipc-file-fuzz" $(find
"${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.arrow_file")
- # Run known crash files
- "${binary_output_dir}/arrow-ipc-stream-fuzz"
"${ARROW_TEST_DATA}"/arrow-ipc-stream/crash-*
- "${binary_output_dir}/arrow-ipc-stream-fuzz"
"${ARROW_TEST_DATA}"/arrow-ipc-stream/*-testcase-*
- "${binary_output_dir}/arrow-ipc-file-fuzz"
"${ARROW_TEST_DATA}"/arrow-ipc-file/*-testcase-*
- "${binary_output_dir}/arrow-ipc-tensor-stream-fuzz"
"${ARROW_TEST_DATA}"/arrow-ipc-tensor-stream/*-testcase-*
+ pushd "${ARROW_TEST_DATA}"
+ "${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/crash-*
+ "${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/*-testcase-*
+ "${binary_output_dir}/arrow-ipc-file-fuzz" arrow-ipc-file/*-testcase-*
+ "${binary_output_dir}/arrow-ipc-tensor-stream-fuzz"
arrow-ipc-tensor-stream/*-testcase-*
if [ "${ARROW_PARQUET}" == "ON" ]; then
- "${binary_output_dir}/parquet-arrow-fuzz"
"${ARROW_TEST_DATA}"/parquet/fuzzing/*-testcase-*
+ "${binary_output_dir}/parquet-arrow-fuzz" parquet/fuzzing/*-testcase-*
+ # TODO replay encoding regression files when we have some
fi
- "${binary_output_dir}/arrow-csv-fuzz"
"${ARROW_TEST_DATA}"/csv/fuzzing/*-testcase-*
+ "${binary_output_dir}/arrow-csv-fuzz" csv/fuzzing/*-testcase-*
+ popd
fi
popd
diff --git a/cpp/build-support/fuzzing/generate_corpuses.sh
b/cpp/build-support/fuzzing/generate_corpuses.sh
index 273c2a20d0..07afa793dc 100755
--- a/cpp/build-support/fuzzing/generate_corpuses.sh
+++ b/cpp/build-support/fuzzing/generate_corpuses.sh
@@ -29,7 +29,7 @@ set -ex
CORPUS_DIR=/tmp/corpus
PANDAS_DIR=/tmp/pandas
-ARROW_ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd)
+ARROW_ROOT=$(cd $(dirname "$BASH_SOURCE")/../../..; pwd)
ARROW_CPP=$ARROW_ROOT/cpp
OUT=$1
@@ -39,10 +39,11 @@ OUT=$1
# Arrow IPC
-IPC_INTEGRATION_FILES=$(find
${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
-
rm -rf ${CORPUS_DIR}
${OUT}/arrow-ipc-generate-fuzz-corpus -stream ${CORPUS_DIR}
+# Add "golden" IPC integration files
+IPC_INTEGRATION_FILES=$(find
${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
+[ -z "${IPC_INTEGRATION_FILES}" ] && exit 1
# Several IPC integration files can have the same name, make sure
# they all appear in the corpus by numbering the duplicates.
cp --backup=numbered ${IPC_INTEGRATION_FILES} ${CORPUS_DIR}
@@ -50,6 +51,9 @@ ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py
${CORPUS_DIR} ${OUT}/arrow-ipc
rm -rf ${CORPUS_DIR}
${OUT}/arrow-ipc-generate-fuzz-corpus -file ${CORPUS_DIR}
+IPC_INTEGRATION_FILES=$(find
${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.arrow_file")
+[ -z "${IPC_INTEGRATION_FILES}" ] && exit 1
+cp --backup=numbered ${IPC_INTEGRATION_FILES} ${CORPUS_DIR}
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR}
${OUT}/arrow-ipc-file-fuzz_seed_corpus.zip
rm -rf ${CORPUS_DIR}
diff --git a/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
b/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
index a21bc31829..8f0b0cee95 100644
--- a/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
+++ b/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
@@ -467,7 +467,8 @@ Status FuzzEncoding(const uint8_t* data, int64_t size) {
ARROW_ASSIGN_OR_RAISE(const auto parse_result,
FuzzEncodingHeader::Parse(std::span(data, size)));
- auto& [header, encoded_data] = parse_result;
+ const auto header = parse_result.first;
+ const auto encoded_data = parse_result.second;
if (encoded_data.size() > static_cast<size_t>(kInt32Max)) {
// Unlikely but who knows?
return Status::Invalid("Fuzz payload too large");