This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 14698c8b99b80db7e6fd99900e32b6742bef1662
Author: Joe McDonnell <[email protected]>
AuthorDate: Fri Nov 4 14:33:04 2022 -0700

IMPALA-11603: Build against Cloudflare ZLIB by default

Cloudflare Zlib is a fork of the Zlib codebase that has been optimized to take advantage of CPU SIMD instructions and other platform-specific optimizations. It has the same license as regular Zlib. Amazon has touted this as a major speedup over regular Zlib:
https://aws.amazon.com/blogs/opensource/improving-zlib-cloudflare-and-comparing-performance-with-other-zlib-forks/

This adds the IMPALA_USE_CLOUDFLARE_ZLIB environment variable which allows Impala to be built against Cloudflare Zlib. This defaults to true. If set to any other value, it will build against regular Zlib.

Cloudflare Zlib shows a clear performance benefit over regular Zlib on TPC-H ORC/deflate benchmark:
+----------+-------------------+---------+------------+------------+----------------+
| Workload | File Format       | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+----------+-------------------+---------+------------+------------+----------------+
| TPCH(42) | orc / def / block | 4.18    | -6.43%     | 3.29       | -6.74%         |
+----------+-------------------+---------+------------+------------+----------------+

Testing:
- Ran GVO tests and exhaustive release tests

Change-Id: I82c480890726da0fa5bdc2a646022554eec181f4
Reviewed-on: http://gerrit.cloudera.org:8080/19207
Tested-by: Impala Public Jenkins <[email protected]>
Reviewed-by: Michael Smith <[email protected]>
Reviewed-by: Wenzhe Zhou <[email protected]>
---
 CMakeLists.txt             | 4 ++++
 bin/bootstrap_toolchain.py | 9 +++++----
 bin/impala-config.sh       | 9 +++++++++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt index b7c7a65cd..b7e858812 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,6 +143,7 @@ set_dep_root(ZLIB) set_dep_root(CCTZ) set_dep_root(CURL) set_dep_root(CALLONCEHACK) +set_dep_root(CLOUDFLAREZLIB) # The boost-cmake 
project hasn't been maintained for years. Let's make sure we # don't accidentally use it if it can be found. @@ -214,6 +215,9 @@ IMPALA_ADD_THIRDPARTY_LIB(openssl_crypto "" "" ${OPENSSL_CRYPTO_LIBRARY}) find_package(Bzip2 REQUIRED) IMPALA_ADD_THIRDPARTY_LIB(bzip2 ${BZIP2_INCLUDE_DIR} ${BZIP2_STATIC_LIBRARIES} "") +if ($ENV{IMPALA_USE_CLOUDFLARE_ZLIB} STREQUAL "true") + set(ZLIB_ROOT ${CLOUDFLAREZLIB_ROOT}) +endif() find_package(Zlib REQUIRED) IMPALA_ADD_THIRDPARTY_LIB(zlib ${ZLIB_INCLUDE_DIR} ${ZLIB_STATIC_LIBRARIES} ${ZLIB_SHARED_LIBRARIES}) diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py index 78b493d54..59628a3c6 100755 --- a/bin/bootstrap_toolchain.py +++ b/bin/bootstrap_toolchain.py @@ -465,10 +465,11 @@ def get_toolchain_downloads(): gcc_package = ToolchainPackage("gcc") toolchain_packages += [llvm_package, llvm_package_asserts, gcc_package] toolchain_packages += [ToolchainPackage(p) for p in - ["avro", "binutils", "boost", "breakpad", "bzip2", "calloncehack", "cctz", "cmake", - "crcutil", "curl", "flatbuffers", "gdb", "gflags", "glog", "gperftools", "gtest", - "jwt-cpp", "libev", "libunwind", "lz4", "openldap", "orc", "protobuf", - "python", "rapidjson", "re2", "snappy", "tpc-h", "tpc-ds", "zlib", "zstd"]] + ["avro", "binutils", "boost", "breakpad", "bzip2", "calloncehack", "cctz", + "cloudflarezlib", "cmake", "crcutil", "curl", "flatbuffers", "gdb", "gflags", + "glog", "gperftools", "gtest", "jwt-cpp", "libev", "libunwind", "lz4", "openldap", + "orc", "protobuf", "python", "rapidjson", "re2", "snappy", "tpc-h", "tpc-ds", + "zlib", "zstd"]] python3_package = ToolchainPackage( "python", explicit_version=os.environ.get("IMPALA_PYTHON3_VERSION")) toolchain_packages += [python3_package] diff --git a/bin/impala-config.sh b/bin/impala-config.sh index 558989c4c..5f788c37b 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -173,6 +173,8 @@ export IMPALA_TPC_H_VERSION=2.17.0 unset IMPALA_TPC_H_URL export IMPALA_ZLIB_VERSION=1.2.13 
unset IMPALA_ZLIB_URL +export IMPALA_CLOUDFLAREZLIB_VERSION=9e601a3f37 +unset IMPALA_CLOUDFLAREZLIB_URL export IMPALA_CALLONCEHACK_VERSION=1.0.0 unset IMPALA_CALLONCEHACK_URL # Thrift related environment variables. @@ -269,6 +271,13 @@ export IMPALA_REDHAT8_DOCKER_BASE=${IMPALA_REDHAT8_DOCKER_BASE:-"rockylinux:8.5" # Impala's Java code. export IMPALA_DOCKER_USE_JAVA11=${IMPALA_DOCKER_USE_JAVA11:-"false"} +# There are multiple compatible implementations of zlib. Cloudflare Zlib is an +# implementation with optimizations to use platform-specific CPU features that are not +# in the standard Zlib implementation. When set to true, this builds and links against +# Cloudflare Zlib. When false, the build uses the regular Madler Zlib. This defaults +# to true due to the large performance benefits. +export IMPALA_USE_CLOUDFLARE_ZLIB=${IMPALA_USE_CLOUDFLARE_ZLIB:-"true"} + # When IMPALA_(CDP_COMPONENT)_URL are overridden, they may contain '$(platform_label)' # which will be substituted for the CDP platform label in bootstrap_toolchain.py unset IMPALA_HADOOP_URL
