https://github.com/serge-sans-paille created https://github.com/llvm/llvm-project/pull/127020
None

>From c2d1352aba4872957e34633b92d87c39d0eb7e45 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguel...@mozilla.com>
Date: Tue, 11 Feb 2025 18:20:15 +0100
Subject: [PATCH 1/2] [clang][cmake] Sanitize CLANG_BOLT values

This avoids failing later in the build process.
---
 clang/tools/driver/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index ad336fcc45b60..5d7962769014a 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -23,10 +23,14 @@ if(CLANG_PLUGIN_SUPPORT)
   set(support_plugins SUPPORT_PLUGINS)
 endif()
 
+set(CLANG_BOLT_ALLOWLIST INSTRUMENT PERF LBR)
 set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
-  May be specified as Instrument or Perf or LBR to use a particular profiling \
+May be specified as one of ${CLANG_BOLT_ALLOWLIST} to use a particular profiling \
   mechanism.")
 string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
+if (CLANG_BOLT AND NOT CLANG_BOLT IN_LIST CLANG_BOLT_ALLOWLIST)
+  message(FATAL_ERROR "Specified CLANG_BOLT value '${CLANG_BOLT}' is not one of ${CLANG_BOLT_ALLOWLIST}.")
+endif()
 
 if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
   set(CLANG_BOLT_DEPS clear-bolt-fdata llvm-bolt llvm-readobj)

>From 4c34b09c96735d3af346c19e05149bc553038a8c Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguel...@mozilla.com>
Date: Thu, 13 Feb 2025 08:54:03 +0100
Subject: [PATCH 2/2] [clang][bolt] Improve CLANG_BOLT setup to support shared libraries

When linking clang with libLLVM and clang-cpp dynamically, bolt post
processing only optimizes the clang binary. This patch makes sure it
also instruments libLLVM and libclang-cpp, otherwise optimizing just the
clang binary yields limited benefits.

This currently only works on Linux due to reliance on LD_PRELOAD to have
the instrumented binary use the instrumented shared libraries.
---
 clang/tools/driver/CMakeLists.txt        |  40 ++++--
 clang/utils/perf-training/perf-helper.py | 149 ++++++++++++++---------
 2 files changed, 125 insertions(+), 64 deletions(-)

diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index 5d7962769014a..10ea5de387220 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -168,6 +168,28 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
   )
   set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")
 
+  set(CLANG_BOLT_INPUTS $<TARGET_FILE:clang>)
+  set(CLANG_INSTRUMENTED_OUTPUTS ${CLANG_INSTRUMENTED})
+
+  # Add in dynamically linked libraries, if needs be. Currently only supported
+  # on Linux because it relies on LD_PRELOAD for instrumentation.
+  if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    if (CLANG_LINK_CLANG_DYLIB)
+      set(CLANG_CPP_BOLT_INSTRUMENTED "clang-cxx-bolt.inst" CACHE STRING
+        "Name of BOLT-instrumented Clang library")
+      set(CLANG_CPP_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_CPP_BOLT_INSTRUMENTED})
+      list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:clang-cpp>)
+      list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${CLANG_CPP_INSTRUMENTED})
+    endif()
+    if (LLVM_LINK_LLVM_DYLIB)
+      set(LLVM_BOLT_INSTRUMENTED "LLVM-bolt.inst" CACHE STRING
+        "Name of BOLT-instrumented LLVM library")
+      set(LLVM_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLVM_BOLT_INSTRUMENTED})
+      list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:LLVM>)
+      list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${LLVM_INSTRUMENTED})
+    endif()
+  endif()
+
   # This POST_BUILD command is executed unconditionally even if the clang target
   # is already built. We need to wrap the whole bolt optimization process in
   # a single python wrapper, so that we can first check if the binary has
@@ -176,15 +198,15 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
     TARGET clang POST_BUILD
     COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
             bolt-optimize
-             --method ${CLANG_BOLT}
-             --input $<TARGET_FILE:clang>
-             --instrumented-output ${CLANG_INSTRUMENTED}
-             --fdata ${BOLT_FDATA}
-             --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
-             --readelf $<TARGET_FILE:llvm-readobj>
-             --bolt $<TARGET_FILE:llvm-bolt>
-             --lit "${LIT_COMMAND}"
-             --merge-fdata $<TARGET_FILE:merge-fdata>
+            --method ${CLANG_BOLT}
+            --input "${CLANG_BOLT_INPUTS}"
+            --instrumented-output "${CLANG_INSTRUMENTED_OUTPUTS}"
+            --fdata ${BOLT_FDATA}
+            --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
+            --readelf $<TARGET_FILE:llvm-readobj>
+            --bolt $<TARGET_FILE:llvm-bolt>
+            --lit "${LIT_COMMAND}"
+            --merge-fdata $<TARGET_FILE:merge-fdata>
     COMMENT "Optimizing Clang with BOLT"
     USES_TERMINAL
     VERBATIM
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index 55c5160a71c4f..ea32ef216bcaa 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -559,6 +559,24 @@ def genOrderFile(args):
     return 0
 
 
+def filter_bolt_optimized(inputs, instrumented_outputs, readelf):
+    """Drop the inputs (and matching outputs) that BOLT already optimized."""
+    new_inputs = []
+    new_instrumented_outputs = []
+    for input, instrumented_output in zip(inputs, instrumented_outputs):
+        output = subprocess.check_output(
+            [readelf, "-WS", input], universal_newlines=True
+        )
+
+        # This binary has already been bolt-optimized, so skip further processing.
+        if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+            print(f"Skipping {input}, it's already BOLT-optimized")
+        else:
+            new_inputs.append(input)
+            new_instrumented_outputs.append(instrumented_output)
+    return new_inputs, new_instrumented_outputs
+
+
 def bolt_optimize(args):
     parser = argparse.ArgumentParser("%prog [options] ")
 
@@ -574,47 +592,66 @@ def bolt_optimize(args):
 
     opts = parser.parse_args(args)
 
-    output = subprocess.check_output(
-        [opts.readelf, "-WS", opts.input], universal_newlines=True
-    )
+    inputs = opts.input.split(';')
+    instrumented_outputs = opts.instrumented_output.split(';')
+    assert len(inputs) == len(instrumented_outputs), "inconsistent --input / --instrumented-output arguments"
 
-    # This binary has already been bolt-optimized, so skip further processing.
-    if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+    inputs, instrumented_outputs = filter_bolt_optimized(inputs, instrumented_outputs, opts.readelf)
+    if not inputs:
         return 0
 
+    environ = os.environ.copy()
     if opts.method == "INSTRUMENT":
-        process = subprocess.run(
-            [
-                opts.bolt,
-                opts.input,
-                "-o",
-                opts.instrumented_output,
-                "-instrument",
-                "--instrumentation-file-append-pid",
-                f"--instrumentation-file={opts.fdata}",
-            ],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            text=True,
-        )
+        preloads = []
+        for input, instrumented_output in zip(inputs, instrumented_outputs):
+            args = [
+                opts.bolt,
+                input,
+                "-o",
+                instrumented_output,
+                "-instrument",
+                "--instrumentation-file-append-pid",
+                f"--instrumentation-file={opts.fdata}",
+            ]
+            print("Running: " + " ".join(args))
+            process = subprocess.run(
+                args,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+            )
 
-        print(process.args)
-        for line in process.stdout:
-            sys.stdout.write(line)
-        process.check_returncode()
+            for line in process.stdout:
+                sys.stdout.write(line)
+            process.check_returncode()
 
-        process = subprocess.run(
-            [
+            output = subprocess.check_output(
+                [opts.readelf, "--file-header", input], universal_newlines=True
+            )
+            if re.search(r"Type:\s*((Shared)|(DYN))", output):
+                # force using the instrumented version
+                preloads.append(instrumented_output)
+
+        if preloads:
+            print("Patching execution environment for dynamic library")
+            environ["LD_PRELOAD"] = os.pathsep.join(preloads)
+
+
+        args = [
                 sys.executable,
                 opts.lit,
-                os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
-            ],
+                "-v",
+                os.path.join(opts.perf_training_binary_dir, f"bolt-fdata"),
+            ]
+        print("Running: " + " ".join(args))
+        process = subprocess.run(
+            args,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             text=True,
+            env=environ,
         )
 
-        print(process.args)
         for line in process.stdout:
             sys.stdout.write(line)
         process.check_returncode()
@@ -624,35 +661,37 @@ def bolt_optimize(args):
 
     merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
 
-    shutil.copy(opts.input, f"{opts.input}-prebolt")
+    for input in inputs:
+        shutil.copy(input, f"{input}-prebolt")
 
-    process = subprocess.run(
-        [
-            opts.bolt,
-            f"{opts.input}-prebolt",
-            "-o",
-            opts.input,
-            "-data",
-            opts.fdata,
-            "-reorder-blocks=ext-tsp",
-            "-reorder-functions=cdsort",
-            "-split-functions",
-            "-split-all-cold",
-            "-split-eh",
-            "-dyno-stats",
-            "-use-gnu-stack",
-            "-update-debug-sections",
-            "-nl" if opts.method == "PERF" else "",
-        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-    )
+        args = [
+            opts.bolt,
+            f"{input}-prebolt",
+            "-o",
+            input,
+            "-data",
+            opts.fdata,
+            "-reorder-blocks=ext-tsp",
+            "-reorder-functions=cdsort",
+            "-split-functions",
+            "-split-all-cold",
+            "-split-eh",
+            "-dyno-stats",
+            "-use-gnu-stack",
+            "-update-debug-sections",
+            "-nl" if opts.method == "PERF" else "",
+        ]
+        print("Running: " + " ".join(args))
+        process = subprocess.run(
+            args,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
 
-    print(process.args)
-    for line in process.stdout:
-        sys.stdout.write(line)
-    process.check_returncode()
+        for line in process.stdout:
+            sys.stdout.write(line)
+        process.check_returncode()
 
 
 commands = {
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits