Author: arpith Date: Thu Feb 16 08:25:35 2017 New Revision: 295323 URL: http://llvm.org/viewvc/llvm-project?rev=295323&view=rev Log: Revert r295319 while investigating buildbot failure.
Removed: cfe/trunk/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp cfe/trunk/lib/CodeGen/CodeGenFunction.h Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp?rev=295323&r1=295322&r2=295323&view=diff ============================================================================== --- cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp (original) +++ cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp Thu Feb 16 08:25:35 2017 @@ -4257,10 +4257,12 @@ static void emitReductionCombiner(CodeGe CGF.EmitIgnoredExpr(ReductionOp); } -llvm::Value *CGOpenMPRuntime::emitReductionFunction( - CodeGenModule &CGM, llvm::Type *ArgsType, ArrayRef<const Expr *> Privates, - ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, - ArrayRef<const Expr *> ReductionOps) { +static llvm::Value *emitReductionFunction(CodeGenModule &CGM, + llvm::Type *ArgsType, + ArrayRef<const Expr *> Privates, + ArrayRef<const Expr *> LHSExprs, + ArrayRef<const Expr *> RHSExprs, + ArrayRef<const Expr *> ReductionOps) { auto &C = CGM.getContext(); // void reduction_func(void *LHSArg, void *RHSArg); @@ -4343,11 +4345,11 @@ llvm::Value *CGOpenMPRuntime::emitReduct return Fn; } -void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, - const Expr *ReductionOp, - const Expr *PrivateRef, - const DeclRefExpr *LHS, - const DeclRefExpr *RHS) { +static void emitSingleReductionCombiner(CodeGenFunction &CGF, + const Expr *ReductionOp, + const Expr *PrivateRef, + const DeclRefExpr *LHS, + const DeclRefExpr *RHS) { if (PrivateRef->getType()->isArrayType()) { // Emit reduction for array section. 
auto *LHSVar = cast<VarDecl>(LHS->getDecl()); @@ -4367,13 +4369,9 @@ void CGOpenMPRuntime::emitReduction(Code ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, ArrayRef<const Expr *> ReductionOps, - ReductionOptionsTy Options) { + bool WithNowait, bool SimpleReduction) { if (!CGF.HaveInsertPoint()) return; - - bool WithNowait = Options.WithNowait; - bool SimpleReduction = Options.SimpleReduction; - // Next code should be emitted for reduction: // // static kmp_critical_name lock = { 0 }; @@ -4515,13 +4513,12 @@ void CGOpenMPRuntime::emitReduction(Code }; auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps]( CodeGenFunction &CGF, PrePostActionTy &Action) { - auto &RT = CGF.CGM.getOpenMPRuntime(); auto IPriv = Privates.begin(); auto ILHS = LHSExprs.begin(); auto IRHS = RHSExprs.begin(); for (auto *E : ReductionOps) { - RT.emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), - cast<DeclRefExpr>(*IRHS)); + emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), + cast<DeclRefExpr>(*IRHS)); ++IPriv; ++ILHS; ++IRHS; Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h?rev=295323&r1=295322&r2=295323&view=diff ============================================================================== --- cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h (original) +++ cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h Thu Feb 16 08:25:35 2017 @@ -893,32 +893,6 @@ public: OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen, bool HasCancel = false); - - /// Emits reduction function. - /// \param ArgsType Array type containing pointers to reduction variables. - /// \param Privates List of private copies for original reduction arguments. - /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. - /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. 
- /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' - /// or 'operator binop(LHS, RHS)'. - llvm::Value *emitReductionFunction(CodeGenModule &CGM, llvm::Type *ArgsType, - ArrayRef<const Expr *> Privates, - ArrayRef<const Expr *> LHSExprs, - ArrayRef<const Expr *> RHSExprs, - ArrayRef<const Expr *> ReductionOps); - - /// Emits single reduction combiner - void emitSingleReductionCombiner(CodeGenFunction &CGF, - const Expr *ReductionOp, - const Expr *PrivateRef, - const DeclRefExpr *LHS, - const DeclRefExpr *RHS); - - struct ReductionOptionsTy { - bool WithNowait; - bool SimpleReduction; - OpenMPDirectiveKind ReductionKind; - }; /// \brief Emit a code for reduction clause. Next code should be emitted for /// reduction: /// \code @@ -955,18 +929,14 @@ public: /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' /// or 'operator binop(LHS, RHS)'. - /// \param Options List of options for reduction codegen: - /// WithNowait true if parent directive has also nowait clause, false - /// otherwise. - /// SimpleReduction Emit reduction operation only. Used for omp simd - /// directive on the host. - /// ReductionKind The kind of reduction to perform. + /// \param WithNowait true if parent directive has also nowait clause, false + /// otherwise. virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, ArrayRef<const Expr *> ReductionOps, - ReductionOptionsTy Options); + bool WithNowait, bool SimpleReduction); /// \brief Emit code for 'taskwait' directive. 
virtual void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc); Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp?rev=295323&r1=295322&r2=295323&view=diff ============================================================================== --- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp (original) +++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp Thu Feb 16 08:25:35 2017 @@ -44,20 +44,6 @@ enum OpenMPRTLFunctionNVPTX { /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 /// global_tid); OMPRTL_NVPTX__kmpc_end_serialized_parallel, - /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element, - /// int16_t lane_offset, int16_t warp_size); - OMPRTL_NVPTX__kmpc_shuffle_int32, - /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element, - /// int16_t lane_offset, int16_t warp_size); - OMPRTL_NVPTX__kmpc_shuffle_int64, - /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 - /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data, - /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - /// lane_offset, int16_t shortCircuit), - /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); - OMPRTL_NVPTX__kmpc_parallel_reduce_nowait, - /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); - OMPRTL_NVPTX__kmpc_end_reduce_nowait }; /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. @@ -114,25 +100,6 @@ public: } ~ExecutionModeRAII() { Mode = SavedMode; } }; - -/// GPU Configuration: This information can be derived from cuda registers, -/// however, providing compile time constants helps generate more efficient -/// code. For all practical purposes this is fine because the configuration -/// is the same for all known NVPTX architectures. 
-enum MachineConfiguration : unsigned { - WarpSize = 32, - /// Number of bits required to represent a lane identifier, which is - /// computed as log_2(WarpSize). - LaneIDBits = 5, - LaneIDMask = WarpSize - 1, -}; - -enum NamedBarrier : unsigned { - /// Synchronize on this barrier #ID using a named barrier primitive. - /// Only the subset of active threads in a parallel region arrive at the - /// barrier. - NB_Parallel = 1, -}; } // anonymous namespace /// Get the GPU warp size. @@ -153,23 +120,6 @@ static llvm::Value *getNVPTXThreadID(Cod llvm::None, "nvptx_tid"); } -/// Get the id of the warp in the block. -/// We assume that the warp size is 32, which is always the case -/// on the NVPTX device, to generate more efficient code. -static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id"); -} - -/// Get the id of the current lane in the Warp. -/// We assume that the warp size is 32, which is always the case -/// on the NVPTX device, to generate more efficient code. -static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask), - "nvptx_lane_id"); -} - /// Get the maximum number of threads in a block of the GPU. static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; @@ -186,25 +136,9 @@ static void getNVPTXCTABarrier(CodeGenFu &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); } -/// Get barrier #ID to synchronize selected (multiple of warp size) threads in -/// a CTA. -static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, - llvm::Value *NumThreads) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads}; - Bld.CreateCall(llvm::Intrinsic::getDeclaration(&CGF.CGM.getModule(), - llvm::Intrinsic::nvvm_barrier), - Args); -} - /// Synchronize all GPU threads in a block. 
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } -/// Synchronize worker threads in a parallel region. -static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) { - return getNVPTXBarrier(CGF, NB_Parallel, NumThreads); -} - /// Get the value of the thread_limit clause in the teams directive. /// For the 'generic' execution mode, the runtime encodes thread_limit in /// the launch parameters, always starting thread_limit+warpSize threads per @@ -649,60 +583,6 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntime RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel"); break; } - case OMPRTL_NVPTX__kmpc_shuffle_int32: { - // Build int32_t __kmpc_shuffle_int32(int32_t element, - // int16_t lane_offset, int16_t warp_size); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; - llvm::FunctionType *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32"); - break; - } - case OMPRTL_NVPTX__kmpc_shuffle_int64: { - // Build int64_t __kmpc_shuffle_int64(int64_t element, - // int16_t lane_offset, int16_t warp_size); - llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty}; - llvm::FunctionType *FnTy = - llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64"); - break; - } - case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: { - // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid, - // kmp_int32 num_vars, size_t reduce_size, void* reduce_data, - // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - // lane_offset, int16_t Algorithm Version), - // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); - llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, - CGM.Int16Ty, CGM.Int16Ty}; - auto *ShuffleReduceFnTy = - llvm::FunctionType::get(CGM.VoidTy, 
ShuffleReduceTypeParams, - /*isVarArg=*/false); - llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; - auto *InterWarpCopyFnTy = - llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, - /*isVarArg=*/false); - llvm::Type *TypeParams[] = {CGM.Int32Ty, - CGM.Int32Ty, - CGM.SizeTy, - CGM.VoidPtrTy, - ShuffleReduceFnTy->getPointerTo(), - InterWarpCopyFnTy->getPointerTo()}; - llvm::FunctionType *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait"); - break; - } - case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { - // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid); - llvm::Type *TypeParams[] = {CGM.Int32Ty}; - llvm::FunctionType *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait"); - break; - } } return RTLFn; } @@ -925,891 +805,3 @@ void CGOpenMPRuntimeNVPTX::emitSpmdParal OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); } - -/// This function creates calls to one of two shuffle functions to copy -/// variables between lanes in a warp. -static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF, - QualType ElemTy, - llvm::Value *Elem, - llvm::Value *Offset) { - auto &CGM = CGF.CGM; - auto &C = CGM.getContext(); - auto &Bld = CGF.Builder; - CGOpenMPRuntimeNVPTX &RT = - *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime())); - - unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity(); - assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction."); - - OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4 - ? OMPRTL_NVPTX__kmpc_shuffle_int32 - : OMPRTL_NVPTX__kmpc_shuffle_int64; - - // Cast all types to 32- or 64-bit values before calling shuffle routines. - auto CastTy = Size <= 4 ? 
CGM.Int32Ty : CGM.Int64Ty; - auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy); - auto *WarpSize = CGF.EmitScalarConversion( - getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true), - C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation()); - - auto *ShuffledVal = - CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn), - {ElemCast, Offset, WarpSize}); - - return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy)); -} - -namespace { -enum CopyAction : unsigned { - // RemoteLaneToThread: Copy over a Reduce list from a remote lane in - // the warp using shuffle instructions. - RemoteLaneToThread, - // ThreadCopy: Make a copy of a Reduce list on the thread's stack. - ThreadCopy, -}; -} // namespace - -/// Emit instructions to copy a Reduce list, which contains partially -/// aggregated values, in the specified direction. -static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, - QualType ReductionArrayTy, - ArrayRef<const Expr *> Privates, - Address SrcBase, Address DestBase, - llvm::Value *RemoteLaneOffset = nullptr) { - - auto &CGM = CGF.CGM; - auto &C = CGM.getContext(); - auto &Bld = CGF.Builder; - - // Iterates, element-by-element, through the source Reduce list and - // make a copy. - unsigned Idx = 0; - for (auto &Private : Privates) { - Address SrcElementAddr = Address::invalid(); - Address DestElementAddr = Address::invalid(); - Address DestElementPtrAddr = Address::invalid(); - // Should we shuffle in an element from a remote lane? - bool ShuffleInElement = false; - // Set to true to update the pointer in the dest Reduce list to a - // newly created element. - bool UpdateDestListPtr = false; - - switch (Action) { - case RemoteLaneToThread: { - // Step 1.1: Get the address for the src element in the Reduce list. 
- Address SrcElementPtrAddr = - Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize()); - llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar( - SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - SrcElementAddr = - Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType())); - - // Step 1.2: Create a temporary to store the element in the destination - // Reduce list. - DestElementPtrAddr = - Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize()); - DestElementAddr = - CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); - ShuffleInElement = true; - UpdateDestListPtr = true; - break; - } - case ThreadCopy: { - // Step 1.1: Get the address for the src element in the Reduce list. - Address SrcElementPtrAddr = - Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize()); - llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar( - SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - SrcElementAddr = - Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType())); - - // Step 1.2: Get the address for dest element. The destination - // element has already been created on the thread's stack. 
- DestElementPtrAddr = - Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize()); - llvm::Value *DestElementPtr = - CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false, - C.VoidPtrTy, SourceLocation()); - Address DestElemAddr = - Address(DestElementPtr, C.getTypeAlignInChars(Private->getType())); - DestElementAddr = Bld.CreateElementBitCast( - DestElemAddr, CGF.ConvertTypeForMem(Private->getType())); - break; - } - } - - // Regardless of src and dest of copy, we emit the load of src - // element as this is required in all directions - SrcElementAddr = Bld.CreateElementBitCast( - SrcElementAddr, CGF.ConvertTypeForMem(Private->getType())); - llvm::Value *Elem = - CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false, - Private->getType(), SourceLocation()); - - // Now that all active lanes have read the element in the - // Reduce list, shuffle over the value from the remote lane. - if (ShuffleInElement) { - Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem, - RemoteLaneOffset); - } - - // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false, - Private->getType()); - - // Step 3.1: Modify reference in dest Reduce list as needed. - // Modifying the reference in Reduce list to point to the newly - // created element. The element is live in the current function - // scope and that of functions it invokes (i.e., reduce_function). - // RemoteReduceData[i] = (void*)&RemoteElem - if (UpdateDestListPtr) { - CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast( - DestElementAddr.getPointer(), CGF.VoidPtrTy), - DestElementPtrAddr, /*Volatile=*/false, - C.VoidPtrTy); - } - - Idx++; - } -} - -/// This function emits a helper that gathers Reduce lists from the first -/// lane of every active warp to lanes in the first warp. 
-/// -/// void inter_warp_copy_func(void* reduce_data, num_warps) -/// shared smem[warp_size]; -/// For all data entries D in reduce_data: -/// If (I am the first lane in each warp) -/// Copy my local D to smem[warp_id] -/// sync -/// if (I am the first warp) -/// Copy smem[thread_id] to my local D -/// sync -static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, - ArrayRef<const Expr *> Privates, - QualType ReductionArrayTy) { - auto &C = CGM.getContext(); - auto &M = CGM.getModule(); - - // ReduceList: thread local Reduce list. - // At the stage of the computation when this function is called, partially - // aggregated values reside in the first lane of every active warp. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, C.VoidPtrTy); - // NumWarps: number of warps active in the parallel region. This could - // be smaller than 32 (max warps in a CTA) for partial block reduction. - ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, - C.getIntTypeForBitwidth(32, /* Signed */ true)); - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&NumWarpsArg); - - auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_inter_warp_copy_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI); - CodeGenFunction CGF(CGM); - // We don't need debug information in this function as nothing here refers to - // user code. - CGF.disableDebugInfo(); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); - - auto &Bld = CGF.Builder; - - // This array is used as a medium to transfer, one reduce element at a time, - // the data from the first lane of every warp to lanes in the first warp - // in order to perform the final step of a reduction in a parallel region - // (reduction across warps). 
The array is placed in NVPTX __shared__ memory - // for reduced latency, as well as to have a distinct copy for concurrently - // executing target regions. The array is declared with common linkage so - // as to be shared across compilation units. - const char *TransferMediumName = - "__openmp_nvptx_data_transfer_temporary_storage"; - llvm::GlobalVariable *TransferMedium = - M.getGlobalVariable(TransferMediumName); - if (!TransferMedium) { - auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize); - unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); - TransferMedium = new llvm::GlobalVariable( - M, Ty, - /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage, - llvm::Constant::getNullValue(Ty), TransferMediumName, - /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, - SharedAddressSpace); - } - - // Get the CUDA thread id of the current OpenMP thread on the GPU. - auto *ThreadID = getNVPTXThreadID(CGF); - // nvptx_lane_id = nvptx_id % warpsize - auto *LaneID = getNVPTXLaneID(CGF); - // nvptx_warp_id = nvptx_id / warpsize - auto *WarpID = getNVPTXWarpID(CGF); - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, SourceLocation()), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); - - unsigned Idx = 0; - for (auto &Private : Privates) { - // - // Warp master copies reduce element to transfer medium in __shared__ - // memory. 
- // - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - - // if (lane_id == 0) - auto IsWarpMaster = - Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master"); - Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); - CGF.EmitBlock(ThenBB); - - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = - Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = (type[i]*)(elemptrptr) - Address ElemPtr = - Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); - ElemPtr = Bld.CreateElementBitCast( - ElemPtr, CGF.ConvertTypeForMem(Private->getType())); - // elem = *elemptr - llvm::Value *Elem = CGF.EmitLoadOfScalar( - ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation()); - - // Get pointer to location in transfer medium. - // MediumPtr = &medium[warp_id] - llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); - Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType())); - // Casting to actual data type. 
- // MediumPtr = (type[i]*)MediumPtrAddr; - MediumPtr = Bld.CreateElementBitCast( - MediumPtr, CGF.ConvertTypeForMem(Private->getType())); - - //*MediumPtr = elem - Bld.CreateStore(Elem, MediumPtr); - - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(MergeBB); - - Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); - llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( - AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation()); - - auto *NumActiveThreads = Bld.CreateNSWMul( - NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads"); - // named_barrier_sync(ParallelBarrierID, num_active_threads) - syncParallelThreads(CGF, NumActiveThreads); - - // - // Warp 0 copies reduce element from transfer medium. - // - llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); - - // Up to 32 threads in warp 0 are active. 
- auto IsActiveThread = - Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); - Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); - - CGF.EmitBlock(W0ThenBB); - - // SrcMediumPtr = &medium[tid] - llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); - Address SrcMediumPtr(SrcMediumPtrVal, - C.getTypeAlignInChars(Private->getType())); - // SrcMediumVal = *SrcMediumPtr; - SrcMediumPtr = Bld.CreateElementBitCast( - SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType())); - llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar( - SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation()); - - // TargetElemPtr = (type[i]*)(SrcDataAddr[i]) - Address TargetElemPtrPtr = - Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); - llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( - TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - Address TargetElemPtr = - Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType())); - TargetElemPtr = Bld.CreateElementBitCast( - TargetElemPtr, CGF.ConvertTypeForMem(Private->getType())); - - // *TargetElemPtr = SrcMediumVal; - CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, - Private->getType()); - Bld.CreateBr(W0MergeBB); - - CGF.EmitBlock(W0ElseBB); - Bld.CreateBr(W0MergeBB); - - CGF.EmitBlock(W0MergeBB); - - // While warp 0 copies values from transfer medium, all other warps must - // wait. - syncParallelThreads(CGF, NumActiveThreads); - Idx++; - } - - CGF.FinishFunction(); - return Fn; -} - -/// Emit a helper that reduces data across two OpenMP threads (lanes) -/// in the same warp. It uses shuffle instructions to copy over data from -/// a remote lane's stack. The reduction algorithm performed is specified -/// by the fourth parameter. -/// -/// Algorithm Versions. 
-/// Full Warp Reduce (argument value 0): -/// This algorithm assumes that all 32 lanes are active and gathers -/// data from these 32 lanes, producing a single resultant value. -/// Contiguous Partial Warp Reduce (argument value 1): -/// This algorithm assumes that only a *contiguous* subset of lanes -/// are active. This happens for the last warp in a parallel region -/// when the user specified num_threads is not an integer multiple of -/// 32. This contiguous subset always starts with the zeroth lane. -/// Partial Warp Reduce (argument value 2): -/// This algorithm gathers data from any number of lanes at any position. -/// All reduced values are stored in the lowest possible lane. The set -/// of problems every algorithm addresses is a super set of those -/// addressable by algorithms with a lower version number. Overhead -/// increases as algorithm version increases. -/// -/// Terminology -/// Reduce element: -/// Reduce element refers to the individual data field with primitive -/// data types to be combined and reduced across threads. -/// Reduce list: -/// Reduce list refers to a collection of local, thread-private -/// reduce elements. -/// Remote Reduce list: -/// Remote Reduce list refers to a collection of remote (relative to -/// the current thread) reduce elements. -/// -/// We distinguish between three states of threads that are important to -/// the implementation of this function. -/// Alive threads: -/// Threads in a warp executing the SIMT instruction, as distinguished from -/// threads that are inactive due to divergent control flow. -/// Active threads: -/// The minimal set of threads that has to be alive upon entry to this -/// function. The computation is correct iff active threads are alive. -/// Some threads are alive but they are not active because they do not -/// contribute to the computation in any useful manner. Turning them off -/// may introduce control flow overheads without any tangible benefits. 
-/// Effective threads: -/// In order to comply with the argument requirements of the shuffle -/// function, we must keep all lanes holding data alive. But at most -/// half of them perform value aggregation; we refer to this half of -/// threads as effective. The other half is simply handing off their -/// data. -/// -/// Procedure -/// Value shuffle: -/// In this step active threads transfer data from higher lane positions -/// in the warp to lower lane positions, creating Remote Reduce list. -/// Value aggregation: -/// In this step, effective threads combine their thread local Reduce list -/// with Remote Reduce list and store the result in the thread local -/// Reduce list. -/// Value copy: -/// In this step, we deal with the assumption made by algorithm 2 -/// (i.e. contiguity assumption). When we have an odd number of lanes -/// active, say 2k+1, only k threads will be effective and therefore k -/// new values will be produced. However, the Reduce list owned by the -/// (2k+1)th thread is ignored in the value aggregation. Therefore -/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so -/// that the contiguity assumption still holds. -static llvm::Value * -emitShuffleAndReduceFunction(CodeGenModule &CGM, - ArrayRef<const Expr *> Privates, - QualType ReductionArrayTy, llvm::Value *ReduceFn) { - auto &C = CGM.getContext(); - - // Thread local Reduce list used to host the values of data to be reduced. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, C.VoidPtrTy); - // Current lane id; could be logical. - ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, C.ShortTy); - // Offset of the remote source lane relative to the current lane. - ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, C.ShortTy); - // Algorithm version. This is expected to be known at compile time. 
- ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, C.ShortTy); - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&LaneIDArg); - Args.push_back(&RemoteLaneOffsetArg); - Args.push_back(&AlgoVerArg); - - auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI); - CodeGenFunction CGF(CGM); - // We don't need debug information in this function as nothing here refers to - // user code. - CGF.disableDebugInfo(); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); - - auto &Bld = CGF.Builder; - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, SourceLocation()), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); - - Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); - llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( - AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); - llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( - AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); - llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( - AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - // Create a local thread-private variable to host the Reduce list - // from a remote lane. 
- Address RemoteReduceList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list"); - - // This loop iterates through the list of reduce elements and copies, - // element by element, from a remote lane in the warp to RemoteReduceList, - // hosted on the thread's stack. - emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, - LocalReduceList, RemoteReduceList, - RemoteLaneOffsetArgVal); - - // The actions to be performed on the Remote Reduce list is dependent - // on the algorithm version. - // - // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && - // LaneId % 2 == 0 && Offset > 0): - // do the reduction value aggregation - // - // The thread local variable Reduce list is mutated in place to host the - // reduced data, which is the aggregated value produced from local and - // remote lanes. - // - // Note that AlgoVer is expected to be a constant integer known at compile - // time. - // When AlgoVer==0, the first conjunction evaluates to true, making - // the entire predicate true during compile time. - // When AlgoVer==1, the second conjunction has only the second part to be - // evaluated during runtime. Other conjunctions evaluates to false - // during compile time. - // When AlgoVer==2, the third conjunction has only the second part to be - // evaluated during runtime. Other conjunctions evaluates to false - // during compile time. 
- auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0)); - - auto CondAlgo1 = - Bld.CreateAnd(Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)), - Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); - - auto CondAlgo2 = Bld.CreateAnd( - Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)), - Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)), - Bld.getInt16(0))); - CondAlgo2 = Bld.CreateAnd( - CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); - - auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); - CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); - - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); - - CGF.EmitBlock(ThenBB); - // reduce_function(LocalReduceList, RemoteReduceList) - llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - LocalReduceList.getPointer(), CGF.VoidPtrTy); - llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - RemoteReduceList.getPointer(), CGF.VoidPtrTy); - CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(MergeBB); - - // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local - // Reduce list. 
- auto CondCopy = - Bld.CreateAnd(Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)), - Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); - - llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); - Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); - - CGF.EmitBlock(CpyThenBB); - emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, - RemoteReduceList, LocalReduceList); - Bld.CreateBr(CpyMergeBB); - - CGF.EmitBlock(CpyElseBB); - Bld.CreateBr(CpyMergeBB); - - CGF.EmitBlock(CpyMergeBB); - - CGF.FinishFunction(); - return Fn; -} - -/// -/// Design of OpenMP reductions on the GPU -/// -/// Consider a typical OpenMP program with one or more reduction -/// clauses: -/// -/// float foo; -/// double bar; -/// #pragma omp target teams distribute parallel for \ -/// reduction(+:foo) reduction(*:bar) -/// for (int i = 0; i < N; i++) { -/// foo += A[i]; bar *= B[i]; -/// } -/// -/// where 'foo' and 'bar' are reduced across all OpenMP threads in -/// all teams. In our OpenMP implementation on the NVPTX device an -/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads -/// within a team are mapped to CUDA threads within a threadblock. -/// Our goal is to efficiently aggregate values across all OpenMP -/// threads such that: -/// -/// - the compiler and runtime are logically concise, and -/// - the reduction is performed efficiently in a hierarchical -/// manner as follows: within OpenMP threads in the same warp, -/// across warps in a threadblock, and finally across teams on -/// the NVPTX device. -/// -/// Introduction to Decoupling -/// -/// We would like to decouple the compiler and the runtime so that the -/// latter is ignorant of the reduction variables (number, data types) -/// and the reduction operators. This allows a simpler interface -/// and implementation while still attaining good performance. 
-/// -/// Pseudocode for the aforementioned OpenMP program generated by the -/// compiler is as follows: -/// -/// 1. Create private copies of reduction variables on each OpenMP -/// thread: 'foo_private', 'bar_private' -/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned -/// to it and writes the result in 'foo_private' and 'bar_private' -/// respectively. -/// 3. Call the OpenMP runtime on the GPU to reduce within a team -/// and store the result on the team master: -/// -/// __kmpc_nvptx_parallel_reduce_nowait(..., -/// reduceData, shuffleReduceFn, interWarpCpyFn) -/// -/// where: -/// struct ReduceData { -/// double *foo; -/// double *bar; -/// } reduceData -/// reduceData.foo = &foo_private -/// reduceData.bar = &bar_private -/// -/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two -/// auxiliary functions generated by the compiler that operate on -/// variables of type 'ReduceData'. They aid the runtime perform -/// algorithmic steps in a data agnostic manner. -/// -/// 'shuffleReduceFn' is a pointer to a function that reduces data -/// of type 'ReduceData' across two OpenMP threads (lanes) in the -/// same warp. It takes the following arguments as input: -/// -/// a. variable of type 'ReduceData' on the calling lane, -/// b. its lane_id, -/// c. an offset relative to the current lane_id to generate a -/// remote_lane_id. The remote lane contains the second -/// variable of type 'ReduceData' that is to be reduced. -/// d. an algorithm version parameter determining which reduction -/// algorithm to use. -/// -/// 'shuffleReduceFn' retrieves data from the remote lane using -/// efficient GPU shuffle intrinsics and reduces, using the -/// algorithm specified by the 4th parameter, the two operands -/// element-wise. The result is written to the first operand. -/// -/// Different reduction algorithms are implemented in different -/// runtime functions, all calling 'shuffleReduceFn' to perform -/// the essential reduction step. 
Therefore, based on the 4th -/// parameter, this function behaves slightly differently to -/// cooperate with the runtime to ensure correctness under -/// different circumstances. -/// -/// 'InterWarpCpyFn' is a pointer to a function that transfers -/// reduced variables across warps. It tunnels, through CUDA -/// shared memory, the thread-private data of type 'ReduceData' -/// from lane 0 of each warp to a lane in the first warp. -/// 5. if ret == 1: -/// The team master of the last team stores the reduced -/// result to the globals in memory. -/// foo += reduceData.foo; bar *= reduceData.bar -/// -/// -/// Warp Reduction Algorithms -/// -/// On the warp level, we have three algorithms implemented in the -/// OpenMP runtime depending on the number of active lanes: -/// -/// Full Warp Reduction -/// -/// The reduce algorithm within a warp where all lanes are active -/// is implemented in the runtime as follows: -/// -/// full_warp_reduce(void *reduce_data, -/// kmp_ShuffleReductFctPtr ShuffleReduceFn) { -/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) -/// ShuffleReduceFn(reduce_data, 0, offset, 0); -/// } -/// -/// The algorithm completes in log(2, WARPSIZE) steps. -/// -/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is -/// not used therefore we save instructions by not retrieving lane_id -/// from the corresponding special registers. The 4th parameter, which -/// represents the version of the algorithm being used, is set to 0 to -/// signify full warp reduction. 
-/// -/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: -/// -/// #reduce_elem refers to an element in the local lane's data structure -/// #remote_elem is retrieved from a remote lane -/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); -/// reduce_elem = reduce_elem REDUCE_OP remote_elem; -/// -/// Contiguous Partial Warp Reduction -/// -/// This reduce algorithm is used within a warp where only the first -/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the -/// number of OpenMP threads in a parallel region is not a multiple of -/// WARPSIZE. The algorithm is implemented in the runtime as follows: -/// -/// void -/// contiguous_partial_reduce(void *reduce_data, -/// kmp_ShuffleReductFctPtr ShuffleReduceFn, -/// int size, int lane_id) { -/// int curr_size; -/// int offset; -/// curr_size = size; -/// offset = curr_size/2; -/// while (offset>0) { -/// ShuffleReduceFn(reduce_data, lane_id, offset, 1); -/// curr_size = (curr_size+1)/2; -/// offset = curr_size/2; -/// } -/// } -/// -/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: -/// -/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); -/// if (lane_id < offset) -/// reduce_elem = reduce_elem REDUCE_OP remote_elem -/// else -/// reduce_elem = remote_elem -/// -/// This algorithm assumes that the data to be reduced are located in a -/// contiguous subset of lanes starting from the first. When there is -/// an odd number of active lanes, the data in the last lane is not -/// aggregated with any other lane's data but is instead copied over. -/// -/// Dispersed Partial Warp Reduction -/// -/// This algorithm is used within a warp when any discontiguous subset of -/// lanes are active. It is used to implement the reduction operation -/// across lanes in an OpenMP simd region or in a nested parallel region. 
-/// -/// void -/// dispersed_partial_reduce(void *reduce_data, -/// kmp_ShuffleReductFctPtr ShuffleReduceFn) { -/// int size, remote_id; -/// int logical_lane_id = number_of_active_lanes_before_me() * 2; -/// do { -/// remote_id = next_active_lane_id_right_after_me(); -/// # the above function returns 0 if no active lane -/// # is present right after the current lane. -/// size = number_of_active_lanes_in_this_warp(); -/// logical_lane_id /= 2; -/// ShuffleReduceFn(reduce_data, logical_lane_id, -/// remote_id-1-threadIdx.x, 2); -/// } while (logical_lane_id % 2 == 0 && size > 1); -/// } -/// -/// There is no assumption made about the initial state of the reduction. -/// Any number of lanes (>=1) could be active at any position. The reduction -/// result is returned in the first active lane. -/// -/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: -/// -/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); -/// if (lane_id % 2 == 0 && offset > 0) -/// reduce_elem = reduce_elem REDUCE_OP remote_elem -/// else -/// reduce_elem = remote_elem -/// -/// -/// Intra-Team Reduction -/// -/// This function, as implemented in the runtime call -/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP -/// threads in a team. It first reduces within a warp using the -/// aforementioned algorithms. We then proceed to gather all such -/// reduced values at the first warp. -/// -/// The runtime makes use of the function 'InterWarpCpyFn', which copies -/// data from each of the "warp master" (zeroth lane of each warp, where -/// warp-reduced data is held) to the zeroth warp. This step reduces (in -/// a mathematical sense) the problem of reduction across warp masters in -/// a block to the problem of warp reduction. 
-/// -void CGOpenMPRuntimeNVPTX::emitReduction( - CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, - ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, - ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { - if (!CGF.HaveInsertPoint()) - return; - - bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); - assert(ParallelReduction && "Invalid reduction selection in emitReduction."); - - auto &C = CGM.getContext(); - - // 1. Build a list of reduction variables. - // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; - auto Size = RHSExprs.size(); - for (auto *E : Privates) { - if (E->getType()->isVariablyModifiedType()) - // Reserve place for array size. - ++Size; - } - llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); - QualType ReductionArrayTy = - C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, - /*IndexTypeQuals=*/0); - Address ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - unsigned Idx = 0; - for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, - CGF.getPointerSize()); - CGF.Builder.CreateStore( - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), - Elem); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size. - ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, - CGF.getPointerSize()); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .first, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } - } - - // 2. Emit reduce_func(). 
- auto *ReductionFn = emitReductionFunction( - CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates, - LHSExprs, RHSExprs, ReductionOps); - - // 4. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList), - // RedList, shuffle_reduce_func, interwarp_copy_func); - auto *ThreadId = getThreadID(CGF, Loc); - auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); - auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - ReductionList.getPointer(), CGF.VoidPtrTy); - - auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction( - CGM, Privates, ReductionArrayTy, ReductionFn); - auto *InterWarpCopyFn = - emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy); - - llvm::Value *Res = nullptr; - if (ParallelReduction) { - llvm::Value *Args[] = {ThreadId, - CGF.Builder.getInt32(RHSExprs.size()), - ReductionArrayTySize, - RL, - ShuffleAndReduceFn, - InterWarpCopyFn}; - - Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait), - Args); - } - - // 5. Build switch(res) - auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default"); - auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1); - - // 6. Build case 1: where we have reduced values in the master - // thread in each team. 
- // __kmpc_end_reduce{_nowait}(<gtid>); - // break; - auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1"); - SwInst->addCase(CGF.Builder.getInt32(1), Case1BB); - CGF.EmitBlock(Case1BB); - - // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); - llvm::Value *EndArgs[] = {ThreadId}; - auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps, - this](CodeGenFunction &CGF, PrePostActionTy &Action) { - auto IPriv = Privates.begin(); - auto ILHS = LHSExprs.begin(); - auto IRHS = RHSExprs.begin(); - for (auto *E : ReductionOps) { - emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), - cast<DeclRefExpr>(*IRHS)); - ++IPriv; - ++ILHS; - ++IRHS; - } - }; - RegionCodeGenTy RCG(CodeGen); - NVPTXActionTy Action( - nullptr, llvm::None, - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait), - EndArgs); - RCG.setAction(Action); - RCG(CGF); - CGF.EmitBranch(DefaultBB); - CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); -} Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h?rev=295323&r1=295322&r2=295323&view=diff ============================================================================== --- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h (original) +++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h Thu Feb 16 08:25:35 2017 @@ -67,6 +67,12 @@ private: /// \brief Signal termination of Spmd mode execution. void emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); + /// \brief Returns specified OpenMP runtime function for the current OpenMP + /// implementation. Specialized for the NVPTX device. + /// \param Function OpenMP runtime function. + /// \return Specified function. + llvm::Constant *createNVPTXRuntimeFunction(unsigned Function); + // // Base class overrides. // @@ -242,32 +248,7 @@ public: ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) override; - /// Emit a code for reduction clause. 
- /// - /// \param Privates List of private copies for original reduction arguments. - /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. - /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. - /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' - /// or 'operator binop(LHS, RHS)'. - /// \param Options List of options for reduction codegen: - /// WithNowait true if parent directive has also nowait clause, false - /// otherwise. - /// SimpleReduction Emit reduction operation only. Used for omp simd - /// directive on the host. - /// ReductionKind The kind of reduction to perform. - virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, - ArrayRef<const Expr *> Privates, - ArrayRef<const Expr *> LHSExprs, - ArrayRef<const Expr *> RHSExprs, - ArrayRef<const Expr *> ReductionOps, - ReductionOptionsTy Options) override; - - /// Returns specified OpenMP runtime function for the current OpenMP - /// implementation. Specialized for the NVPTX device. - /// \param Function OpenMP runtime function. - /// \return Specified function. - llvm::Constant *createNVPTXRuntimeFunction(unsigned Function); - +public: /// Target codegen is specialized based on two programming models: the /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd' /// model for constructs like 'target parallel' that support it. 
Modified: cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp?rev=295323&r1=295322&r2=295323&view=diff ============================================================================== --- cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp (original) +++ cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp Thu Feb 16 08:25:35 2017 @@ -1190,7 +1190,7 @@ void CodeGenFunction::EmitOMPReductionCl } void CodeGenFunction::EmitOMPReductionClauseFinal( - const OMPExecutableDirective &D, const OpenMPDirectiveKind ReductionKind) { + const OMPExecutableDirective &D) { if (!HaveInsertPoint()) return; llvm::SmallVector<const Expr *, 8> Privates; @@ -1206,15 +1206,14 @@ void CodeGenFunction::EmitOMPReductionCl ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end()); } if (HasAtLeastOneReduction) { - bool WithNowait = D.getSingleClause<OMPNowaitClause>() || - isOpenMPParallelDirective(D.getDirectiveKind()) || - D.getDirectiveKind() == OMPD_simd; - bool SimpleReduction = D.getDirectiveKind() == OMPD_simd; // Emit nowait reduction if nowait clause is present or directive is a // parallel directive (it always has implicit barrier). 
CGM.getOpenMPRuntime().emitReduction( *this, D.getLocEnd(), Privates, LHSExprs, RHSExprs, ReductionOps, - {WithNowait, SimpleReduction, ReductionKind}); + D.getSingleClause<OMPNowaitClause>() || + isOpenMPParallelDirective(D.getDirectiveKind()) || + D.getDirectiveKind() == OMPD_simd, + D.getDirectiveKind() == OMPD_simd); } } @@ -1296,7 +1295,7 @@ void CodeGenFunction::EmitOMPParallelDir CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt()); - CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel); + CGF.EmitOMPReductionClauseFinal(S); }; emitCommonOMPParallelDirective(*this, S, OMPD_parallel, CodeGen); emitPostUpdateForReductionClause( @@ -1709,7 +1708,7 @@ void CodeGenFunction::EmitOMPSimdDirecti // Emit final copy of the lastprivate variables at the end of loops. if (HasLastprivateClause) CGF.EmitOMPLastprivateClauseFinal(S, /*NoFinals=*/true); - CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_simd); + CGF.EmitOMPReductionClauseFinal(S); emitPostUpdateForReductionClause( CGF, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; }); } @@ -2245,10 +2244,7 @@ bool CodeGenFunction::EmitOMPWorksharing CGF.EmitLoadOfScalar(IL, S.getLocStart())); }); } - EmitOMPReductionClauseFinal( - S, /*ReductionKind=*/isOpenMPSimdDirective(S.getDirectiveKind()) - ? /*Parallel and Simd*/ OMPD_parallel_for_simd - : /*Parallel only*/ OMPD_parallel); + EmitOMPReductionClauseFinal(S); // Emit post-update of the reduction variables if IsLastIter != 0. 
emitPostUpdateForReductionClause( *this, S, [&](CodeGenFunction &CGF) -> llvm::Value * { @@ -2423,7 +2419,7 @@ void CodeGenFunction::EmitSections(const CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getLocEnd()); }; CGF.OMPCancelStack.emitExit(CGF, S.getDirectiveKind(), CodeGen); - CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel); + CGF.EmitOMPReductionClauseFinal(S); // Emit post-update of the reduction variables if IsLastIter != 0. emitPostUpdateForReductionClause( CGF, S, [&](CodeGenFunction &CGF) -> llvm::Value * { @@ -3821,19 +3817,11 @@ static void emitTargetParallelRegion(Cod // Get the captured statement associated with the 'parallel' region. auto *CS = S.getCapturedStmt(OMPD_parallel); Action.Enter(CGF); - auto &&CodeGen = [&S, CS](CodeGenFunction &CGF, PrePostActionTy &) { - CodeGenFunction::OMPPrivateScope PrivateScope(CGF); - (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope); - CGF.EmitOMPPrivateClause(S, PrivateScope); - CGF.EmitOMPReductionClauseInit(S, PrivateScope); - (void)PrivateScope.Privatize(); + auto &&CodeGen = [CS](CodeGenFunction &CGF, PrePostActionTy &) { // TODO: Add support for clauses. CGF.EmitStmt(CS->getCapturedStmt()); - CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel); }; emitCommonOMPParallelDirective(CGF, S, OMPD_parallel, CodeGen); - emitPostUpdateForReductionClause( - CGF, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; }); } void CodeGenFunction::EmitOMPTargetParallelDeviceFunction( Modified: cfe/trunk/lib/CodeGen/CodeGenFunction.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenFunction.h?rev=295323&r1=295322&r2=295323&view=diff ============================================================================== --- cfe/trunk/lib/CodeGen/CodeGenFunction.h (original) +++ cfe/trunk/lib/CodeGen/CodeGenFunction.h Thu Feb 16 08:25:35 2017 @@ -2638,9 +2638,7 @@ public: /// the end of the directive. 
/// /// \param D Directive that has at least one 'reduction' directives. - /// \param ReductionKind The kind of reduction to perform. - void EmitOMPReductionClauseFinal(const OMPExecutableDirective &D, - const OpenMPDirectiveKind ReductionKind); + void EmitOMPReductionClauseFinal(const OMPExecutableDirective &D); /// \brief Emit initial code for linear variables. Creates private copies /// and initializes them with the values according to OpenMP standard. /// Removed: cfe/trunk/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp?rev=295322&view=auto ============================================================================== --- cfe/trunk/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp (original) +++ cfe/trunk/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp (removed) @@ -1,830 +0,0 @@ -// Test target codegen - host bc file has to be created first. -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc 
-o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// expected-no-diagnostics -#ifndef HEADER -#define HEADER - -// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. -// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64] - -// Check that the execution mode of all 3 target regions is set to Spmd Mode. -// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 - -template<typename tx> -tx ftemplate(int n) { - int a; - short b; - tx c; - float d; - double e; - - #pragma omp target parallel reduction(+: e) map(tofrom: e) - { - e += 5; - } - - #pragma omp target parallel reduction(^: c) reduction(*: d) map(tofrom: c,d) - { - c ^= 2; - d *= 33; - } - - #pragma omp target parallel reduction(|: a) reduction(max: b) map(tofrom: a,b) - { - a |= 1; - b = 99 > b ? 
99 : b; - } - - return a+b+c+d+e; -} - -int bar(int n){ - int a = 0; - - a += ftemplate<char>(n); - - return a; -} - - // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}( - // - // CHECK: call void @__kmpc_spmd_kernel_init( - // CHECK: br label {{%?}}[[EXECUTE:.+]] - // - // CHECK: [[EXECUTE]] - // CHECK: {{call|invoke}} void [[PFN:@.+]](i32* - // CHECK: call void @__kmpc_spmd_kernel_deinit() - // - // - // define internal void [[PFN]]( - // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align - // CHECK: [[EV:%.+]] = load double, double* [[E]], align - // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 - // CHECK: store double [[ADD]], double* [[E]], align - // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* - // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align - // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) - // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ - // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] - - // CHECK: [[REDLABEL]] - // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align - // CHECK: [[EV:%.+]] = load double, double* [[E]], align - // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] - // CHECK: store double [[ADD]], double* [[E_IN]], align - // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( - // CHECK: br label %[[DEFAULTLABEL]] - // - // CHECK: [[DEFAULTLABEL]] - // CHECK: ret - - // - // Reduction function - // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) - // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** 
[[VAR_RHS_REF]], - // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* - // - // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], - // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* - // - // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], - // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], - // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] - // CHECK: store double [[RES]], double* [[VAR_LHS]], - // CHECK: ret void - - // - // Shuffle and reduce function - // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) - // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align - // CHECK: [[REMOTE_ELT:%.+]] = alloca double - // - // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align - // - // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64 - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) - // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double - // - // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align - // CHECK: 
[[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* - // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align - // - // Condition to reduce - // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 - // - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] - // - // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 - // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 - // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 - // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] - // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 - // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] - // - // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] - // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] - // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] - // - // CHECK: [[DO_REDUCE]] - // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* - // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) - // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] - // - // CHECK: [[REDUCE_ELSE]] - // CHECK: br label {{%?}}[[REDUCE_CONT]] - // - // CHECK: [[REDUCE_CONT]] - // Now check if we should just copy over the remote reduction list - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] - // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // CHECK: [[DO_COPY]] - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: 
[[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align - // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // CHECK: [[COPY_CONT]] - // CHECK: void - - // - // Inter warp copy function - // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) - // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 - // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 - // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align - // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* - // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. 
- // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // - // Read into warp 0. - // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // CHECK: ret - - - - - - - - - - - // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}( - // - // CHECK: call void @__kmpc_spmd_kernel_init( - // CHECK: br label {{%?}}[[EXECUTE:.+]] - // - // CHECK: [[EXECUTE]] - // CHECK: {{call|invoke}} void [[PFN1:@.+]](i32* - // CHECK: call void @__kmpc_spmd_kernel_deinit() - // - // - // define internal void [[PFN1]]( - // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align - // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align - // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 - // CHECK: [[XOR:%.+]] = xor 
i32 [[CONV]], 2 - // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 - // CHECK: store i8 [[TRUNC]], i8* [[C]], align - // CHECK: [[DV:%.+]] = load float, float* [[D]], align - // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} - // CHECK: store float [[MUL]], float* [[D]], align - // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: store i8* [[C]], i8** [[PTR1]], align - // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* - // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align - // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) - // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ - // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] - - // CHECK: [[REDLABEL]] - // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align - // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 - // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align - // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 - // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] - // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 - // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align - // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align - // CHECK: [[DV:%.+]] = load float, float* [[D]], align - // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] - // CHECK: store float [[MUL]], float* [[D_IN]], align - // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( - // CHECK: br label %[[DEFAULTLABEL]] - // - // CHECK: [[DEFAULTLABEL]] - // CHECK: ret - - // - // Reduction function - // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) - // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds 
[[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], - // - // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], - // - // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], - // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* - // - // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], - // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* - // - // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], - // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 - // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], - // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 - // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] - // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 - // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], - // - // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], - // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], - // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] - // CHECK: store float [[RES]], float* [[VAR2_LHS]], - // CHECK: ret void - - // - // Shuffle and reduce function - // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) - // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align - // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 - // CHECK: [[REMOTE_ELT2:%.+]] = alloca float - // - // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align - // 
CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align - // - // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) - // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 - // - // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align - // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* - // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align - // - // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32 - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) - // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float - // - // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align - // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* - // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align - 
// - // Condition to reduce - // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 - // - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] - // - // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 - // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 - // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 - // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] - // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 - // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] - // - // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] - // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] - // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] - // - // CHECK: [[DO_REDUCE]] - // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* - // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) - // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] - // - // CHECK: [[REDUCE_ELSE]] - // CHECK: br label {{%?}}[[REDUCE_CONT]] - // - // CHECK: [[REDUCE_CONT]] - // Now check if we should just copy over the remote reduction list - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] - // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // CHECK: [[DO_COPY]] - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** 
[[ELT_REF]], - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align - // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align - // - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* - // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align - // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // CHECK: [[COPY_CONT]] - // CHECK: void - - // - // Inter warp copy function - // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) - // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 - // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 - // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align - // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: store i8 [[ELT_VAL]], i8 
addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // - // Read into warp 0. - // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* - // CHECK: 
[[ELT_VAL:%.+]] = load float, float* [[ELT]], align - // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* - // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // - // Read into warp 0. - // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* - // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void 
@llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // CHECK: ret - - - - - - - - - - - // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( - // - // CHECK: call void @__kmpc_spmd_kernel_init( - // CHECK: br label {{%?}}[[EXECUTE:.+]] - // - // CHECK: [[EXECUTE]] - // CHECK: {{call|invoke}} void [[PFN2:@.+]](i32* - // CHECK: call void @__kmpc_spmd_kernel_deinit() - // - // - // define internal void [[PFN2]]( - // CHECK: store i32 0, i32* [[A:%.+]], align - // CHECK: store i16 -32768, i16* [[B:%.+]], align - // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align - // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1 - // CHECK: store i32 [[OR]], i32* [[A]], align - // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align - // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 - // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]] - // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] - // - // CHECK: [[DO_MAX]] - // CHECK: br label {{%?}}[[MAX_CONT:.+]] - // - // CHECK: [[MAX_ELSE]] - // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align - // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32 - // CHECK: br label {{%?}}[[MAX_CONT]] - // - // CHECK: [[MAX_CONT]] - // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ] - // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16 - // CHECK: store i16 [[TRUNC]], i16* [[B]], align - // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8* - // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align - // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* - // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align - // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* 
[[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) - // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ - // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] - - // CHECK: [[REDLABEL]] - // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align - // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align - // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] - // CHECK: store i32 [[OR]], i32* [[A_IN]], align - // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align - // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 - // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align - // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 - // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] - // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] - // - // CHECK: [[DO_MAX]] - // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align - // CHECK: br label {{%?}}[[MAX_CONT:.+]] - // - // CHECK: [[MAX_ELSE]] - // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align - // CHECK: br label {{%?}}[[MAX_CONT]] - // - // CHECK: [[MAX_CONT]] - // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] - // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align - // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( - // CHECK: br label %[[DEFAULTLABEL]] - // - // CHECK: [[DEFAULTLABEL]] - // CHECK: ret - - // - // Reduction function - // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) - // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], - // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* - // - // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], - // CHECK: [[VAR1_LHS:%.+]] = bitcast 
i8* [[VAR1_LHS_VOID]] to i32* - // - // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], - // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* - // - // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], - // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* - // - // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], - // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], - // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] - // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], - // - // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], - // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 - // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], - // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 - // - // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] - // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] - // - // CHECK: [[DO_MAX]] - // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align - // CHECK: br label {{%?}}[[MAX_CONT:.+]] - // - // CHECK: [[MAX_ELSE]] - // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align - // CHECK: br label {{%?}}[[MAX_CONT]] - // - // CHECK: [[MAX_CONT]] - // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] - // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], - // CHECK: ret void - - // - // Shuffle and reduce function - // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) - // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align - // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 - // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 - // - // 
CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align - // - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) - // - // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align - // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* - // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* - // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align - // - // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) - // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 - // - // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align - // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* 
[[REMOTE_ELT2]] to i8* - // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align - // - // Condition to reduce - // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 - // - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] - // - // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 - // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 - // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 - // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] - // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 - // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] - // - // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] - // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] - // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] - // - // CHECK: [[DO_REDUCE]] - // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* - // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) - // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] - // - // CHECK: [[REDUCE_ELSE]] - // CHECK: br label {{%?}}[[REDUCE_CONT]] - // - // CHECK: [[REDUCE_CONT]] - // Now check if we should just copy over the remote reduction list - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] - // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // CHECK: [[DO_COPY]] - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], 
[[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align - // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align - // - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* - // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align - // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // CHECK: [[COPY_CONT]] - // CHECK: void - - // - // Inter warp copy function - // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) - // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 - // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 - // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align - // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 
x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // - // Read into warp 0. - // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label 
{{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* - // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align - // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // - // Read into warp 0. 
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* - // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // CHECK: ret - -#endif _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits