================ @@ -2051,36 +2057,1424 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc, /*IsCancellable*/ true); } -/// Create a function with a unique name and a "void (i8*, i8*)" signature in -/// the given module and return it. -Function *getFreshReductionFunc(Module &M) { +static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) { + BasicBlock::iterator IT(I); + IT++; + return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT); +} + +void OpenMPIRBuilder::emitUsed(StringRef Name, + std::vector<WeakTrackingVH> &List) { + if (List.empty()) + return; + + // Convert List to what ConstantArray needs. + SmallVector<Constant *, 8> UsedArray; + UsedArray.resize(List.size()); + for (unsigned I = 0, E = List.size(); I != E; ++I) + UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(&*List[I]), Builder.getPtrTy()); + + if (UsedArray.empty()) + return; + ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size()); + + auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, UsedArray), Name); + + GV->setSection("llvm.metadata"); +} + +Value *OpenMPIRBuilder::getGPUThreadID() { + return Builder.CreateCall( + getOrCreateRuntimeFunction(M, + OMPRTL___kmpc_get_hardware_thread_id_in_block), + {}); +} + +Value *OpenMPIRBuilder::getGPUWarpSize() { + return Builder.CreateCall( + getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {}); +} + +Value *OpenMPIRBuilder::getNVPTXWarpID() { + unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size); + return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id"); +} + +Value *OpenMPIRBuilder::getNVPTXLaneID() { + unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size); + assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device."); + unsigned LaneIDMask = ~0u >> (32u - LaneIDBits); + return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask), + "nvptx_lane_id"); +} + +Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From, + Type *ToType) { + Type *FromType = From->getType(); + uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType); + uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType); + assert(FromSize > 0 && "From size must be greater than zero"); + assert(ToSize > 0 && "To size must be greater than zero"); + if (FromType == ToType) + return From; + if (FromSize == ToSize) + return Builder.CreateBitCast(From, ToType); + if (ToType->isIntegerTy() && FromType->isIntegerTy()) + return Builder.CreateIntCast(From, ToType, /*isSigned*/ true); + InsertPointTy SaveIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + Value *CastItem = Builder.CreateAlloca(ToType); + Builder.restoreIP(SaveIP); + + Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast( + CastItem, FromType->getPointerTo()); + Builder.CreateStore(From, ValCastItem); + return Builder.CreateLoad(ToType, CastItem); +} + +Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP, + Value *Element, + Type *ElementType, + Value *Offset) { + uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType); + assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction"); + + // Cast all types to 32- or 64-bit values before calling shuffle routines. + Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64); + Value *ElemCast = castValueToType(AllocaIP, Element, CastTy); + Value *WarpSize = + Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true); + Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr( + Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32 + : RuntimeFunction::OMPRTL___kmpc_shuffle_int64); + Value *WarpSizeCast = + Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true); + Value *ShuffleCall = + Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast}); + return castValueToType(AllocaIP, ShuffleCall, CastTy); +} + +void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, + Value *DstAddr, Type *ElemType, + Value *Offset, Type *ReductionArrayTy) { + uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType); + // Create the loop over the big sized data. + // ptr = (void*)Elem; + // ptrEnd = (void*) Elem + 1; + // Step = 8; + // while (ptr + Step < ptrEnd) + // shuffle((int64_t)*ptr); + // Step = 4; + // while (ptr + Step < ptrEnd) + // shuffle((int32_t)*ptr); + // ... + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + Value *ElemPtr = DstAddr; + Value *Ptr = SrcAddr; + for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) { + if (Size < IntSize) + continue; + Type *IntType = Builder.getIntNTy(IntSize * 8); + Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast( + Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast"); + Value *SrcAddrGEP = + Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)}); + ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast"); + + Function *CurFunc = Builder.GetInsertBlock()->getParent(); + if ((Size / IntSize) > 1) { + Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast( + SrcAddrGEP, Builder.getPtrTy()); + BasicBlock *PreCondBB = + BasicBlock::Create(M.getContext(), ".shuffle.pre_cond"); + BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then"); + BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit"); + BasicBlock *CurrentBB = Builder.GetInsertBlock(); + emitBlock(PreCondBB, CurFunc); + PHINode *PhiSrc = + Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2); + PhiSrc->addIncoming(Ptr, CurrentBB); + PHINode *PhiDest = + Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2); + PhiDest->addIncoming(ElemPtr, CurrentBB); + Ptr = PhiSrc; + ElemPtr = PhiDest; + Value *PtrDiff = Builder.CreatePtrDiff( + Builder.getInt8Ty(), PtrEnd, + Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy())); + Builder.CreateCondBr( + Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB, + ExitBB); + emitBlock(ThenBB, CurFunc); + Value *Res = createRuntimeShuffleFunction( + AllocaIP, + Builder.CreateAlignedLoad( + IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)), + IntType, Offset); + Builder.CreateAlignedStore(Res, ElemPtr, + M.getDataLayout().getPrefTypeAlign(ElemType)); + Value *LocalPtr = + Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)}); + Value *LocalElemPtr = + Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)}); + PhiSrc->addIncoming(LocalPtr, ThenBB); + PhiDest->addIncoming(LocalElemPtr, ThenBB); + emitBranch(PreCondBB); + emitBlock(ExitBB, CurFunc); + } else { + Value *Res = createRuntimeShuffleFunction( + AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset); + if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() < + Res->getType()->getScalarSizeInBits()) + Res = Builder.CreateTrunc(Res, ElemType); + Builder.CreateStore(Res, ElemPtr); + Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)}); + ElemPtr = + Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)}); + } + Size = Size % IntSize; + } +} + +void OpenMPIRBuilder::emitReductionListCopy( + InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, + ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, + CopyOptionsTy CopyOptions) { + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; + + // Iterates, element-by-element, through the source Reduce list and + // make a copy. + for (auto En : enumerate(ReductionInfos)) { + const ReductionInfo &RI = En.value(); + Value *SrcElementAddr = nullptr; + Value *DestElementAddr = nullptr; + Value *DestElementPtrAddr = nullptr; + // Should we shuffle in an element from a remote lane? + bool ShuffleInElement = false; + // Set to true to update the pointer in the dest Reduce list to a + // newly created element. + bool UpdateDestListPtr = false; + + // Step 1.1: Get the address for the src element in the Reduce list. + Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP( + ReductionArrayTy, SrcBase, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr); + + // Step 1.2: Create a temporary to store the element in the destination + // Reduce list. + DestElementPtrAddr = Builder.CreateInBoundsGEP( + ReductionArrayTy, DestBase, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + switch (Action) { + case CopyAction::RemoteLaneToThread: { + InsertPointTy CurIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr, + ".omp.reduction.element"); + DestAlloca->setAlignment( + M.getDataLayout().getPrefTypeAlign(RI.ElementType)); + DestElementAddr = DestAlloca; + DestElementAddr = + Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(), + DestElementAddr->getName() + ".ascast"); + Builder.restoreIP(CurIP); + ShuffleInElement = true; + UpdateDestListPtr = true; + break; + } + case CopyAction::ThreadCopy: { + DestElementAddr = + Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr); + break; + } + } + + // Now that all active lanes have read the element in the + // Reduce list, shuffle over the value from the remote lane. + if (ShuffleInElement) { + shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType, + RemoteLaneOffset, ReductionArrayTy); + } else { + switch (RI.EvaluationKind) { + case EvaluationKindTy::Scalar: { + Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr); + // Store the source element value to the dest element address. + Builder.CreateStore(Elem, DestElementAddr); + break; + } + case EvaluationKindTy::Complex: { + assert(false && "Complex data type not handled"); + break; + } ---------------- TIFitis wrote:
@jdoerfert Currently there are no test cases for the Complex kind. I was trying to come up with test cases that might fall under this case but I wasn't able to. Please let me know if this case is infeasible, and we can leave it as an assertion failure. Here's my attempt at creating a test case, but it still falls under the aggregate kind: ``` int foo() { int i; int j; std::complex<int> sum[10][10]; std::complex<int> res; #pragma omp target teams loop reduction(+:sum) for(i=0; i<10; i++) for(j=0; j<10; j++) res += sum[i][j]; return 0; } ``` https://github.com/llvm/llvm-project/pull/80343 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits