This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 131f0c74a38d6167d9ee5e326b934618d0471e34
Author: Michael Smith <[email protected]>
AuthorDate: Fri Mar 8 08:39:53 2024 -0800

    IMPALA-12939: Bound IMPALA_BUILD_THREADS for cgroups and memory
    
    Updates IMPALA_BUILD_THREADS to bound it based on a guideline of 2 GB of
    memory per core during builds. Computes cores and memory from cgroup
    limits if applicable; the cgroup memory limit is combined with physical
    memory by taking the smaller of the two, as sometimes cgroups will report
    a larger limit than available physical memory.
    
    Uses IMPALA_BUILD_THREADS as the default --num_processes for load-data.py.
    
    Adds a default in case USER is unset during bootstrap, which can occur
    in a devcontainer.
    
    Change-Id: I87994d0464073fe2d91bc2f7c2592c012e42de71
    Reviewed-on: http://gerrit.cloudera.org:8080/21200
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Riza Suminto <[email protected]>
---
 bin/bootstrap_development.sh | 21 +++++++++++++++++-
 bin/bootstrap_system.sh      |  3 ++-
 bin/impala-config.sh         | 52 ++++++++++++++++++++++++++++++++++++++++----
 bin/load-data.py             |  6 ++---
 4 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/bin/bootstrap_development.sh b/bin/bootstrap_development.sh
index f3c552260..1b1009f83 100755
--- a/bin/bootstrap_development.sh
+++ b/bin/bootstrap_development.sh
@@ -41,12 +41,31 @@ set -eu -o pipefail
 
 BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
+# Check available disk space before starting.
+available_disk=$(command df --output=avail --block-size=1G . | sed 1d)
+if [[ $available_disk -lt 80 ]]; then
+  echo "Insufficient disk space ($available_disk GB), \
+Impala requires at least 80GB to build and load test data"
+  exit 1
+fi
+
 source "${BINDIR}/bootstrap_system.sh"
 
 export MAX_PYTEST_FAILURES=0
 source bin/impala-config.sh > /dev/null 2>&1
-export NUM_CONCURRENT_TESTS=$(nproc)
+
+BOUNDED_CONCURRENCY=$((AVAILABLE_MEM / 4))
+if [[ $AVAILABLE_MEM -lt 4 ]]; then
+  echo "Insufficient memory ($AVAILABLE_MEM GB) to link Impala test binaries"
+  echo "Increase memory, or run buildall.sh -format -testdata -notests"
+  exit 1
+elif [[ $BOUNDED_CONCURRENCY -lt $IMPALA_BUILD_THREADS ]]; then
+  echo "Bounding concurrency to $BOUNDED_CONCURRENCY for link phase"
+  IMPALA_BUILD_THREADS=$BOUNDED_CONCURRENCY
+fi
+
 time -p ./buildall.sh -format -testdata -skiptests
 
 # To then run the tests:
+# export NUM_CONCURRENT_TESTS=$(nproc)
 # time -p bin/run-all-tests.sh
diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh
index 518d728f5..9c28338d7 100755
--- a/bin/bootstrap_system.sh
+++ b/bin/bootstrap_system.sh
@@ -477,7 +477,8 @@ echo -e "\n* - nofile 1048576" | sudo tee -a 
/etc/security/limits.conf
 # uses systemd, which sets its own default. With Ubuntu 18.04 that default is 
16 KB,
 # 20.04+ defaults to 64 MB. Set all to 64 MB for the current user; Impala test 
systems
 # require 10s of GBs of memory, so this setting should not be a problem.
-echo -e "$USER - memlock 65536" | sudo tee 
/etc/security/limits.d/10-memlock.conf
+USER=${USER-$(id -un)}
+echo -e "${USER} - memlock 65536" | sudo tee 
/etc/security/limits.d/10-memlock.conf
 
 # Default on CentOS limits a user to 1024 or 4096 processes (threads) , which 
isn't
 # enough for minicluster with all of its friends.
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index f53b28c95..09b2fba48 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -961,9 +961,22 @@ export IMPALA_ALL_LOGS_DIRS="${IMPALA_CLUSTER_LOGS_DIR}
   ${IMPALA_CUSTOM_CLUSTER_TEST_LOGS_DIR} ${IMPALA_MVN_LOGS_DIR}
   ${IMPALA_TIMEOUT_LOGS_DIR}"
 
+# Compute CPUs, using cgroup limits if present and not "max" (v2) or negative 
(v1)
+awk_divide_roundup='{ cores = $1/$2; print cores==int(cores) ? cores : 
int(cores)+1 }'
+if grep -v max /sys/fs/cgroup/cpu.max >& /dev/null; then
+  # Get CPU limits under cgroups v2
+  CORES=$(awk "$awk_divide_roundup" /sys/fs/cgroup/cpu.max)
+  echo "Detected $CORES cores from cgroups v2"
+elif grep -v '\-' /sys/fs/cgroup/cpu/cpu.cfs_quota_us >& /dev/null; then
+  # Get CPU limits under cgroups v1
+  CORES=$(paste /sys/fs/cgroup/cpu/cpu.cfs_quota_us 
/sys/fs/cgroup/cpu/cpu.cfs_period_us |
+          awk "$awk_divide_roundup")
+  echo "Detected $CORES cores from cgroups v1"
+else
+  CORES=$(getconf _NPROCESSORS_ONLN)
+fi
 # Reduce the concurrency for local tests to half the number of cores in the 
system.
-CORES=$(($(getconf _NPROCESSORS_ONLN) / 2))
-export NUM_CONCURRENT_TESTS="${NUM_CONCURRENT_TESTS-${CORES}}"
+export NUM_CONCURRENT_TESTS="${NUM_CONCURRENT_TESTS-$((CORES / 2))}"
 
 export KUDU_MASTER_HOSTS="${KUDU_MASTER_HOSTS:-${INTERNAL_LISTEN_HOST}}"
 export KUDU_MASTER_PORT="${KUDU_MASTER_PORT:-7051}"
@@ -1099,8 +1112,37 @@ export 
ASAN_SYMBOLIZER_PATH="${IMPALA_TOOLCHAIN_PACKAGES_HOME}/llvm-${IMPALA_LLV
 
 export CLUSTER_DIR="${IMPALA_HOME}/testdata/cluster"
 
-# The number of parallel build processes we should run at a time.
-export IMPALA_BUILD_THREADS=${IMPALA_BUILD_THREADS-"$(nproc)"}
+# The number of parallel build processes we should run at a time. Require 2GB 
memory per
+# core as too many compilation processes can exhaust available memory and fail 
a build.
+if $IS_OSX; then
+  AVAILABLE_MEM=$(($(sysctl -n hw.memsize) / 1024 / 1024 / 1024))
+else
+  # MemTotal:       65550228 kB
+  AVAILABLE_MEM=$(awk '/MemTotal/{print int($2/1024/1024)}' /proc/meminfo)
+fi
+if grep -v max /sys/fs/cgroup/memory.max >& /dev/null; then
+  # Get memory limits under cgroups v2
+  CGROUP_MEM_LIMIT=$(($(cat /sys/fs/cgroup/memory.max) / 1024 / 1024 / 1024))
+  echo "Detected $CGROUP_MEM_LIMIT GB memory limit from cgroups v2"
+elif grep -v '\-' /sys/fs/cgroup/memory/memory.limit_in_bytes >& /dev/null; 
then
+  # Get memory limits under cgroups v1
+  CGROUP_MEM_LIMIT=$((
+      $(cat /sys/fs/cgroup/memory/memory.limit_in_bytes) / 1024 / 1024 / 1024))
+  echo "Detected $CGROUP_MEM_LIMIT GB memory limit from cgroups v1"
+else
+  CGROUP_MEM_LIMIT=8589934591 # max int64 bytes in GB
+fi
+AVAILABLE_MEM=$((AVAILABLE_MEM > $CGROUP_MEM_LIMIT ? $CGROUP_MEM_LIMIT : 
$AVAILABLE_MEM))
+BOUNDED_CONCURRENCY=$((AVAILABLE_MEM / 2))
+if [[ $AVAILABLE_MEM -lt 2 ]]; then
+  echo "Insufficient memory ($AVAILABLE_MEM GB) to build Impala"
+  exit 1
+elif [[ $BOUNDED_CONCURRENCY -lt $CORES ]]; then
+  echo "Bounding concurrency for available memory ($AVAILABLE_MEM GB)"
+else
+  BOUNDED_CONCURRENCY=$CORES
+fi
+export IMPALA_BUILD_THREADS=${IMPALA_BUILD_THREADS-"${BOUNDED_CONCURRENCY}"}
 
 # Additional flags to pass to make or ninja.
 export IMPALA_MAKE_FLAGS=${IMPALA_MAKE_FLAGS-}
@@ -1180,6 +1222,8 @@ echo "IMPALA_COS_VERSION      = $IMPALA_COS_VERSION"
 echo "IMPALA_OBS_VERSION      = $IMPALA_OBS_VERSION"
 echo "IMPALA_SYSTEM_PYTHON2   = $IMPALA_SYSTEM_PYTHON2"
 echo "IMPALA_SYSTEM_PYTHON3   = $IMPALA_SYSTEM_PYTHON3"
+echo "IMPALA_BUILD_THREADS    = $IMPALA_BUILD_THREADS"
+echo "NUM_CONCURRENT_TESTS    = $NUM_CONCURRENT_TESTS"
 
 # Kerberos things.  If the cluster exists and is kerberized, source
 # the required environment.  This is required for any hadoop tool to
diff --git a/bin/load-data.py b/bin/load-data.py
index 729dcb95b..d729c024f 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -24,7 +24,6 @@ from __future__ import absolute_import, division, 
print_function
 import collections
 import getpass
 import logging
-import multiprocessing
 import os
 import re
 import sqlparse
@@ -77,8 +76,9 @@ parser.add_option("--use_kerberos", action="store_true", 
default=False,
                   help="Load data on a kerberized cluster.")
 parser.add_option("--principal", default=None, dest="principal",
                   help="Kerberos service principal, required if --use_kerberos 
is set")
-parser.add_option("--num_processes", type="int", 
default=multiprocessing.cpu_count(),
-                  dest="num_processes", help="Number of parallel processes to 
use.")
+parser.add_option("--num_processes", type="int", dest="num_processes",
+                  default=os.environ['IMPALA_BUILD_THREADS'],
+                  help="Number of parallel processes to use.")
 
 options, args = parser.parse_args()
 

Reply via email to