================ @@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, uint32_t NumThreads = omp_get_num_threads(); if (NumThreads == 1) return 1; - /* - * This reduce function handles reduction within a team. It handles - * parallel regions in both L1 and L2 parallelism levels. It also - * supports Generic, SPMD, and NoOMP modes. - * - * 1. Reduce within a warp. - * 2. Warp master copies value to warp 0 via shared memory. - * 3. Warp 0 reduces to a single value. - * 4. The reduced value is available in the thread that returns 1. - */ - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t WarpsNeeded = - (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - uint32_t WarpId = mapping::getWarpIdInBlock(); - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/NumThreads % mapping::getWarpSize(), - /*LaneId=*/mapping::getThreadIdInBlock() % - mapping::getWarpSize()); - // When we have more than [mapping::getWarpSize()] number of threads - // a block reduction is performed here. // - // Only L1 parallel region can enter this if condition. - if (NumThreads > mapping::getWarpSize()) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); + // This reduce function handles reduction within a team. It handles + // parallel regions in both L1 and L2 parallelism levels. It also + // supports Generic, SPMD, and NoOMP modes. + // + // 1. Reduce within a warp. + // 2. Warp master copies value to warp 0 via shared memory. + // 3. 
Warp 0 reduces to a single value. + // 4. The reduced value is available in the thread that returns 1. + // - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); +#if __has_builtin(__nvvm_reflect) + if (__nvvm_reflect("__CUDA_ARCH") >= 700) { ---------------- shiltian wrote:
I'll try to make an AMDGPU counterpart for this one, though it doesn't look necessary for the purposes of the OpenMP device runtime. https://github.com/llvm/llvm-project/pull/119091 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits