[PATCH] D97198: [OpenMP][Clang][NVPTX] Only build one bitcode library for each SM

Shilei Tian via Phabricator via cfe-commits Mon, 22 Feb 2021 08:31:18 -0800

tianshilei1992 created this revision.
tianshilei1992 added reviewers: jdoerfert, JonChesterfield, ABataev, grokos, 
ye-luo.
Herald added subscribers: guansong, yaxunl, mgorny.
tianshilei1992 requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.


In D97003 <https://reviews.llvm.org/D97003>, CUDA 9.2 is the minimum 
requirement for OpenMP offloading on
NVPTX target. We don't need to have macros in source code to select right 
functions
based on CUDA version. we don't need to compile multiple bitcode libraries of
different CUDA versions for each SM. We don't need to worry about future
compatibility with newer CUDA version.

`-target-feature +ptx60` is used in this patch, which corresponds to the highest
PTX version that CUDA 9.2 can support.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D97198

Files:
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
  clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
  clang/test/Driver/openmp-offload-gpu.c
  openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -53,46 +53,28 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION < 9020
-  return __nvvm_vote_ballot(1);
-#else
   unsigned int Mask;
   asm volatile("activemask.b32 %0;" : "=r"(Mask));
   return Mask;
-#endif
 }
 
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
 }
 
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
   int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
 }
 
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
   __nvvm_bar_warp_sync(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
 }
 
 // NVPTX specific kernel initialization
Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -137,6 +137,7 @@
              -Xclang -emit-llvm-bc
              -Xclang -aux-triple -Xclang ${aux_triple}
              -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+             -Xclang -target-feature -Xclang +ptx60
              -D__CUDACC__
              -I${devicertl_base_directory}
              -I${devicertl_nvptx_directory}/src)
@@ -150,81 +151,51 @@
 # Create target to build all Bitcode libraries.
 add_custom_target(omptarget-nvptx-bc)
 
-# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-# The last element is the default case.
-set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 71 71 70 65 64 63 61 61 60 42)
-# The following two lines of ugly code is not needed when the minimal CMake
-# version requirement is 3.17+.
-list(LENGTH cuda_version_list num_version_supported)
-math(EXPR loop_range "${num_version_supported} - 1")
-
-# Generate a Bitcode library for all the compute capabilities the user
-# requested and all PTX version we know for now.
+# Generate a Bitcode library for all the compute capabilities the user requested
 foreach(sm ${nvptx_sm_list})
-  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
-
-  # Uncomment the following code and remove those ugly part if the feature
-  # is available.
-  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-  foreach(itr RANGE ${loop_range})
-    list(GET cuda_version_list ${itr} cuda_version)
-    list(GET ptx_feature_list ${itr} ptx_num)
-    set(cuda_flags ${sm_flags})
-    list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-    if("${cuda_version}" MATCHES "^([0-9]+)([0-9])$")
-      set(cuda_version_major ${CMAKE_MATCH_1})
-      set(cuda_version_minor ${CMAKE_MATCH_2})
-    else()
-      libomptarget_error_say(
-        "Unrecognized CUDA version format: ${cuda_version}")
-    endif()
-    list(APPEND cuda_flags
-      "-DCUDA_VERSION=${cuda_version_major}0${cuda_version_minor}0")
-
-    set(bc_files "")
-    foreach(src ${cuda_src_files})
-      get_filename_component(infile ${src} ABSOLUTE)
-      get_filename_component(outfile ${src} NAME)
-      set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
-
-      add_custom_command(OUTPUT ${outfile}
-        COMMAND ${cuda_compiler} ${bc_flags}
-          ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
-        DEPENDS ${infile}
-        IMPLICIT_DEPENDS CXX ${infile}
-        COMMENT "Building LLVM bitcode ${outfile}"
-        VERBATIM
-      )
-      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
-      list(APPEND bc_files ${outfile})
-    endforeach()
-
-    set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
-
-    # Link to a bitcode library.
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-        COMMAND ${bc_linker}
-          -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
-        DEPENDS ${bc_files}
-        COMMENT "Linking LLVM bitcode ${bclib_name}"
+  set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+  set(bc_files "")
+  foreach(src ${cuda_src_files})
+    get_filename_component(infile ${src} ABSOLUTE)
+    get_filename_component(outfile ${src} NAME)
+    set(outfile "${outfile}-sm_${sm}.bc")
+
+    add_custom_command(OUTPUT ${outfile}
+      COMMAND ${cuda_compiler} ${bc_flags}
+        ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+      DEPENDS ${infile}
+      IMPLICIT_DEPENDS CXX ${infile}
+      COMMENT "Building LLVM bitcode ${outfile}"
+      VERBATIM
     )
-    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
+    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
 
-    set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
+    list(APPEND bc_files ${outfile})
+  endforeach()
 
-    add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
-    add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+  set(bclib_name "libomptarget-nvptx-sm_${sm}.bc")
 
-    # Copy library to destination.
-    add_custom_command(TARGET ${bclib_target_name} POST_BUILD
-                      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-                      ${LIBOMPTARGET_LIBRARY_DIR})
+  # Link to a bitcode library.
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+      COMMAND ${bc_linker}
+        -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+      DEPENDS ${bc_files}
+      COMMENT "Linking LLVM bitcode ${bclib_name}"
+  )
+  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
 
-    # Install bitcode library under the lib destination folder.
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-  endforeach()
+  set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc")
+
+  add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+  add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+
+  # Copy library to destination.
+  add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                    ${LIBOMPTARGET_LIBRARY_DIR})
+
+  # Install bitcode library under the lib destination folder.
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
 endforeach()
 
 # Test will be enabled if the building machine supports CUDA
Index: clang/test/Driver/openmp-offload-gpu.c
===================================================================
--- clang/test/Driver/openmp-offload-gpu.c
+++ clang/test/Driver/openmp-offload-gpu.c
@@ -164,7 +164,7 @@
 // RUN:   -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-BCLIB-USER %s
 
-// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-cuda_102-sm_35.bc
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-sm_35.bc
 // CHK-BCLIB-USER: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
 // CHK-BCLIB-NOT: {{error:|warning:}}
 
@@ -177,7 +177,7 @@
 // RUN:   -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-BCLIB-WARN %s
 
-// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-cuda_102-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
 
 /// ###########################################################################
 
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -711,7 +711,6 @@
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
   clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
-  std::string CudaVersionStr;
 
   // New CUDA versions often introduce new instructions that are only supported
   // by new PTX version, so we need to raise PTX level to enable them in NVPTX
@@ -720,7 +719,6 @@
   switch (CudaInstallationVersion) {
 #define CASE_CUDA_VERSION(CUDA_VER, PTX_VER)                                   \
   case CudaVersion::CUDA_##CUDA_VER:                                           \
-    CudaVersionStr = #CUDA_VER;                                                \
     PtxFeature = "+ptx" #PTX_VER;                                              \
     break;
     CASE_CUDA_VERSION(112, 72);
@@ -734,9 +732,6 @@
     CASE_CUDA_VERSION(90, 60);
 #undef CASE_CUDA_VERSION
   default:
-    // If unknown CUDA version, we take it as CUDA 8.0. Same assumption is also
-    // made in libomptarget/deviceRTLs.
-    CudaVersionStr = "80";
     PtxFeature = "+ptx42";
   }
   CC1Args.append({"-target-feature", PtxFeature});
@@ -757,8 +752,7 @@
       return;
     }
 
-    std::string BitcodeSuffix =
-        "nvptx-cuda_" + CudaVersionStr + "-" + GpuArch.str();
+    std::string BitcodeSuffix = "nvptx-" + GpuArch.str();
     addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                        getTriple());
   }

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D97198: [OpenMP][Clang][NVPTX] Only build one bitcode library for each SM

Reply via email to