https://github.com/aaupov updated https://github.com/llvm/llvm-project/pull/69133
>From 96e644279ccec1970c42cca89c05aac186b872e6 Mon Sep 17 00:00:00 2001 From: Amir Aupov <amir.au...@gmail.com> Date: Mon, 16 Oct 2023 01:08:28 +0200 Subject: [PATCH 1/3] [Clang][CMake] Support perf, LBR, and Instrument CLANG_BOLT options Split up and refactor CLANG_BOLT_INSTRUMENT into support for perf no-LBR and perf with LBR profiling modes. Differential Revision: https://reviews.llvm.org/D143617 --- clang/CMakeLists.txt | 44 ++++++++----- clang/cmake/caches/BOLT.cmake | 2 +- clang/utils/perf-training/CMakeLists.txt | 29 ++++++++- clang/utils/perf-training/bolt.lit.cfg | 53 +++++++++++++--- .../utils/perf-training/bolt.lit.site.cfg.in | 2 + clang/utils/perf-training/perf-helper.py | 63 +++++++++++++++++++ 6 files changed, 167 insertions(+), 26 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 9b52c58be41e7f7..8f64d95cc394ffe 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -850,23 +850,38 @@ if (CLANG_ENABLE_BOOTSTRAP) endforeach() endif() -if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) +set(CLANG_BOLT "INSTRUMENT" CACHE STRING "Apply BOLT optimization to Clang. \ + May be specified as Instrument or Perf or LBR to use a particular profiling \ + mechanism.") +string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT) + +if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst) + set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata) - # Instrument clang with BOLT - add_custom_target(clang-instrumented - DEPENDS ${CLANG_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} - DEPENDS clang llvm-bolt - COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} - -instrument --instrumentation-file-append-pid - --instrumentation-file=${BOLT_FDATA} - COMMENT "Instrumenting clang binary with BOLT" - VERBATIM - ) + # Pass extra flag in no-LBR mode + if (uppercase_CLANG_BOLT STREQUAL "PERF") + set(BOLT_NO_LBR "-nl") + endif() + + if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT") + # Instrument clang with BOLT + add_custom_target(clang-instrumented + DEPENDS ${CLANG_INSTRUMENTED} + ) + add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} + DEPENDS clang llvm-bolt + COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} + -instrument --instrumentation-file-append-pid + --instrumentation-file=${BOLT_FDATA} + COMMENT "Instrumenting clang binary with BOLT" + VERBATIM + ) + add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented) + else() # perf or LBR + add_custom_target(clang-bolt-training-deps DEPENDS clang) + endif() # Optimize original (pre-bolt) Clang using the collected profile set(CLANG_OPTIMIZED ${CMAKE_CURRENT_BINARY_DIR}/clang.bolt) @@ -880,6 +895,7 @@ if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) -data ${BOLT_FDATA} -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack + ${BOLT_NO_LBR} COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} $<TARGET_FILE:clang> COMMENT "Optimizing Clang with BOLT" VERBATIM diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake index 0442f73e5426ac7..eba2346b2f4ca12 100644 --- a/clang/cmake/caches/BOLT.cmake +++ b/clang/cmake/caches/BOLT.cmake @@ -1,5 +1,5 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "") +set(CLANG_BOLT "INSTRUMENT" CACHE STRING "") set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "") set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "") diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index c6d51863fb1b5c2..48fbee62a8636d1 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -62,7 +62,9 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD) DEPENDS generate-dtrace-logs) endif() -if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) +if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) + set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING + "Name of BOLT-instrumented Clang binary") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg @@ -71,16 +73,37 @@ if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang" ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang-instrumented clear-bolt-fdata + DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data ) add_custom_target(clear-bolt-fdata COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata COMMENT "Clearing old BOLT fdata") + add_custom_target(clear-perf-data + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data + COMMENT "Clearing old perf data") + + string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT) + if (uppercase_CLANG_BOLT STREQUAL "LBR") + set(BOLT_LBR "--lbr") + endif() + + add_custom_target(merge-fdata-deps) + if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT") + add_dependencies(merge-fdata-deps generate-bolt-fdata) + else() + # Convert perf profiles into fdata + add_custom_target(convert-perf-fdata + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $<TARGET_FILE:llvm-bolt> ${CMAKE_CURRENT_BINARY_DIR} $<TARGET_FILE:clang> ${BOLT_LBR} + COMMENT "Converting perf files to BOLT fdata" + DEPENDS llvm-bolt generate-bolt-fdata) + add_dependencies(merge-fdata-deps convert-perf-fdata) + endif() + # Merge profiles into one using merge-fdata add_custom_target(clang-bolt-profile COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Merging BOLT fdata" - DEPENDS merge-fdata generate-bolt-fdata) + DEPENDS merge-fdata merge-fdata-deps) endif() diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg index 234ac855bd67c65..d2b6042a1627e19 100644 --- a/clang/utils/perf-training/bolt.lit.cfg +++ b/clang/utils/perf-training/bolt.lit.cfg @@ -6,15 +6,52 @@ import lit.util import os import subprocess -config.clang = os.path.realpath(lit.util.which('clang-bolt.inst', config.clang_tools_dir)).replace('\\', '/') +clang_binary = "clang" +perf_wrapper = "" +if config.clang_bolt_mode.lower() == "instrument": + clang_binary = config.clang_bolt_name +else: # perf or LBR + perf_wrapper = "%s %s/perf-helper.py perf" % ( + config.python_exe, + config.perf_helper_dir, + ) + if config.clang_bolt_mode.lower() == "lbr": + perf_wrapper += " --lbr" + perf_wrapper += " -- " -config.name = 'Clang Perf Training' -config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test'] +config.clang = os.path.realpath( + lit.util.which(clang_binary, config.clang_tools_dir) +).replace("\\", "/") + +config.name = "Clang Perf Training" +config.suffixes = [ + ".c", + ".cc", + ".cpp", + ".m", + ".mm", + ".cu", + ".ll", + ".cl", + ".s", + ".S", + ".modulemap", + ".test", +] use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") config.test_format = lit.formats.ShTest(use_lit_shell == "0") -config.substitutions.append( ('%clang_cpp_skip_driver', ' %s --driver-mode=g++ ' % (config.clang))) -config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ ' % (config.clang))) -config.substitutions.append( ('%clang_skip_driver', ' %s ' % (config.clang))) -config.substitutions.append( ('%clang', ' %s ' % (config.clang) ) ) -config.substitutions.append( ('%test_root', config.test_exec_root ) ) +config.substitutions.append( + ( + "%clang_cpp_skip_driver", + " %s %s --driver-mode=g++ " % (perf_wrapper, config.clang), + ) +) +config.substitutions.append( + ("%clang_cpp", " %s %s --driver-mode=g++ " % (perf_wrapper, config.clang)) +) +config.substitutions.append( + ("%clang_skip_driver", " %s %s " % (perf_wrapper, config.clang)) +) +config.substitutions.append(("%clang", " %s %s " % (perf_wrapper, config.clang))) +config.substitutions.append(("%test_root", config.test_exec_root)) diff --git a/clang/utils/perf-training/bolt.lit.site.cfg.in b/clang/utils/perf-training/bolt.lit.site.cfg.in index 3029319673fc26c..54de12701c1ae91 100644 --- a/clang/utils/perf-training/bolt.lit.site.cfg.in +++ b/clang/utils/perf-training/bolt.lit.site.cfg.in @@ -9,6 +9,8 @@ config.test_source_root = "@CLANG_PGO_TRAINING_DATA@" config.target_triple = "@LLVM_TARGET_TRIPLE@" config.python_exe = "@Python3_EXECUTABLE@" config.clang_obj_root = path(r"@CLANG_BINARY_DIR@") +config.clang_bolt_mode = "@CLANG_BOLT@" +config.clang_bolt_name = "@CLANG_BOLT_INSTRUMENTED@" # Let the main config do the real work. lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg") diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index 99d6a3333b6ef08..647bf09ffd24b51 100644 --- a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -67,6 +67,67 @@ def merge_fdata(args): return 0 +def perf(args): + parser = argparse.ArgumentParser( + prog="perf-helper perf", description="perf wrapper for BOLT profile collection" + ) + parser.add_argument( + "--lbr", action="store_true", help="Use perf with branch stacks" + ) + parser.add_argument("cmd", nargs="*", help="") + + # Use python's arg parser to handle all leading option arguments, but pass + # everything else through to perf + first_cmd = next(arg for arg in args if not arg.startswith("--")) + last_arg_idx = args.index(first_cmd) + + opts = parser.parse_args(args[:last_arg_idx]) + cmd = args[last_arg_idx:] + + perf_args = [ + "perf", + "record", + "--event=cycles:u", + "--freq=max", + "--output=%d.perf.data" % os.getpid(), + ] + if opts.lbr: + perf_args += ["--branch-filter=any,u"] + perf_args.extend(cmd) + + start_time = time.time() + subprocess.check_call(perf_args) + + elapsed = time.time() - start_time + print("... data collection took %.4fs" % elapsed) + return 0 + + +def perf2bolt(args): + parser = argparse.ArgumentParser( + prog="perf-helper perf2bolt", + description="perf2bolt conversion wrapper for perf.data files", + ) + parser.add_argument("bolt", help="Path to llvm-bolt") + parser.add_argument("path", help="Path containing perf.data files") + parser.add_argument("binary", help="Input binary") + parser.add_argument("--lbr", action="store_true", help="Use LBR perf2bolt mode") + opts = parser.parse_args(args) + + p2b_args = [ + opts.bolt, + opts.binary, + "--aggregate-only", + "--profile-format=yaml", + ] + if not opts.lbr: + p2b_args += ["-nl"] + p2b_args += ["-p"] + for filename in findFilesWithExtension(opts.path, "perf.data"): + subprocess.check_call(p2b_args + [filename, "-o", filename + ".fdata"]) + return 0 + + def dtrace(args): parser = argparse.ArgumentParser( prog="perf-helper dtrace", @@ -507,6 +568,8 @@ def genOrderFile(args): "cc1": cc1, "gen-order-file": genOrderFile, "merge-fdata": merge_fdata, + "perf": perf, + "perf2bolt": perf2bolt, } >From 5b6fa368c21e483b3d58b48c020e3d2ee63088bd Mon Sep 17 00:00:00 2001 From: Amir Ayupov <aau...@fb.com> Date: Tue, 7 Nov 2023 09:21:20 -0800 Subject: [PATCH 2/3] Address review comments --- clang/CMakeLists.txt | 6 ++-- clang/utils/perf-training/CMakeLists.txt | 6 ++-- clang/utils/perf-training/bolt.lit.cfg | 36 ++++++++++-------------- clang/utils/perf-training/perf-helper.py | 11 ++------ 4 files changed, 24 insertions(+), 35 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 8f64d95cc394ffe..052171d29f9973d 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -853,7 +853,7 @@ endif() set(CLANG_BOLT "INSTRUMENT" CACHE STRING "Apply BOLT optimization to Clang. \ May be specified as Instrument or Perf or LBR to use a particular profiling \ mechanism.") -string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT) +string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) @@ -861,11 +861,11 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata) # Pass extra flag in no-LBR mode - if (uppercase_CLANG_BOLT STREQUAL "PERF") + if (CLANG_BOLT STREQUAL "PERF") set(BOLT_NO_LBR "-nl") endif() - if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT") + if (CLANG_BOLT STREQUAL "INSTRUMENT") # Instrument clang with BOLT add_custom_target(clang-instrumented DEPENDS ${CLANG_INSTRUMENTED} diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index 48fbee62a8636d1..601f40902fa34ea 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -84,13 +84,13 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data COMMENT "Clearing old perf data") - string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT) - if (uppercase_CLANG_BOLT STREQUAL "LBR") + string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) + if (CLANG_BOLT STREQUAL "LBR") set(BOLT_LBR "--lbr") endif() add_custom_target(merge-fdata-deps) - if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT") + if (CLANG_BOLT STREQUAL "INSTRUMENT") add_dependencies(merge-fdata-deps generate-bolt-fdata) else() # Convert perf profiles into fdata diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg index d2b6042a1627e19..0e81a5501e9fcfc 100644 --- a/clang/utils/perf-training/bolt.lit.cfg +++ b/clang/utils/perf-training/bolt.lit.cfg @@ -6,20 +6,21 @@ import lit.util import os import subprocess +clang_bolt_mode = config.clang_bolt_mode.lower() clang_binary = "clang" -perf_wrapper = "" -if config.clang_bolt_mode.lower() == "instrument": +perf_wrapper = f"{config.python_exe} {config.perf_helper_dir}/perf-helper.py perf " + +if clang_bolt_mode == "instrument": + perf_wrapper = "" clang_binary = config.clang_bolt_name -else: # perf or LBR - perf_wrapper = "%s %s/perf-helper.py perf" % ( - config.python_exe, - config.perf_helper_dir, - ) - if config.clang_bolt_mode.lower() == "lbr": - perf_wrapper += " --lbr" +elif clang_bolt_mode == "lbr": + perf_wrapper += " --lbr -- " +elif clang_bolt_mode == "perf": perf_wrapper += " -- " +else: + assert 0, "Unsupported CLANG_BOLT_MODE variable" -config.clang = os.path.realpath( +config.clang = perf_wrapper + os.path.realpath( lit.util.which(clang_binary, config.clang_tools_dir) ).replace("\\", "/") @@ -42,16 +43,9 @@ config.suffixes = [ use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") config.test_format = lit.formats.ShTest(use_lit_shell == "0") config.substitutions.append( - ( - "%clang_cpp_skip_driver", - " %s %s --driver-mode=g++ " % (perf_wrapper, config.clang), - ) -) -config.substitutions.append( - ("%clang_cpp", " %s %s --driver-mode=g++ " % (perf_wrapper, config.clang)) -) -config.substitutions.append( - ("%clang_skip_driver", " %s %s " % (perf_wrapper, config.clang)) + ("%clang_cpp_skip_driver", f" {config.clang} --driver-mode=g++ ") ) -config.substitutions.append(("%clang", " %s %s " % (perf_wrapper, config.clang))) +config.substitutions.append(("%clang_cpp", f" {config.clang} --driver-mode=g++ ")) +config.substitutions.append(("%clang_skip_driver", config.clang)) +config.substitutions.append(("%clang", config.clang)) config.substitutions.append(("%test_root", config.test_exec_root)) diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index 647bf09ffd24b51..8db6e99bbbe1778 100644 --- a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -74,15 +74,10 @@ def perf(args): parser.add_argument( "--lbr", action="store_true", help="Use perf with branch stacks" ) - parser.add_argument("cmd", nargs="*", help="") + parser.add_argument("cmd", nargs=argparse.REMAINDER, help="") - # Use python's arg parser to handle all leading option arguments, but pass - # everything else through to perf - first_cmd = next(arg for arg in args if not arg.startswith("--")) - last_arg_idx = args.index(first_cmd) - - opts = parser.parse_args(args[:last_arg_idx]) - cmd = args[last_arg_idx:] + opts = parser.parse_args() + cmd = opts["cmd"][1:] perf_args = [ "perf", >From 48309821907cf9e9c65130f2164e4f633281ffec Mon Sep 17 00:00:00 2001 From: Amir Ayupov <aau...@fb.com> Date: Tue, 7 Nov 2023 20:25:44 -0800 Subject: [PATCH 3/3] Bugfix --- clang/utils/perf-training/perf-helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index 8db6e99bbbe1778..959bdba5c98ccdb 100644 --- a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -76,8 +76,8 @@ def perf(args): ) parser.add_argument("cmd", nargs=argparse.REMAINDER, help="") - opts = parser.parse_args() - cmd = opts["cmd"][1:] + opts = parser.parse_args(args) + cmd = opts.cmd[1:] perf_args = [ "perf", _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits