ychen created this revision. ychen added reviewers: rjmccall, lxfind, ChuanqiXu. Herald added a subscriber: hiraditya. ychen requested review of this revision. Herald added subscribers: llvm-commits, cfe-commits, jdoerfert. Herald added projects: clang, LLVM.
This is an alternative to D97915 <https://reviews.llvm.org/D97915> which missed proper deallocation of the over-allocated frame. This patch handles both allocations and deallocations. Contrary to D97915 <https://reviews.llvm.org/D97915>, this patch implements the over-allocation in the backend instead of the frontend since - The alloca of the raw frame pointer (suppose we insert it in the frontend) would be included in the non-overaligned frame if we don't teach CoroFrame how to elide it. - Only insert extra code when it is known that the frame is overaligned. - Simpler implementation. - Clients could turn it on/off by setting the newly introduced argument of `llvm.coro.size(i1 alloc)`. `llvm.coro.size` gains it first argument `i1 alloc` to indicate users of the intrinsic are alloc/dealloc functions. It also indicates to LLVM that it should handle overaligned frames. One con of doing this is that many tests need fixup (if the general direction is agreed upon, I'll do that later). It gets a little tricky finding the deallocation function. It is assumed that all CallInst users of `llvm.coro.free` are potentially deallocation functions. I think this should be the implicit assumption in practice although the documentation `Coroutines.rst` does not mention that. Happy to better idea in this regard. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D100739 Files: clang/docs/LanguageExtensions.rst clang/include/clang/Basic/Builtins.def clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Sema/SemaCoroutine.cpp llvm/docs/Coroutines.rst llvm/include/llvm/IR/Intrinsics.td llvm/lib/Transforms/Coroutines/CoroFrame.cpp llvm/lib/Transforms/Coroutines/CoroInstr.h llvm/lib/Transforms/Coroutines/CoroInternal.h llvm/lib/Transforms/Coroutines/CoroSplit.cpp llvm/lib/Transforms/Coroutines/Coroutines.cpp llvm/test/Transforms/Coroutines/ArgAddr.ll llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll llvm/test/Transforms/Coroutines/coro-alloca-01.ll
Index: llvm/test/Transforms/Coroutines/coro-alloca-01.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-alloca-01.ll +++ llvm/test/Transforms/Coroutines/coro-alloca-01.ll @@ -8,7 +8,7 @@ %x = alloca i64 %y = alloca i64 %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) - %size = call i32 @llvm.coro.size.i32() + %size = call i32 @llvm.coro.size.i32(i1 false) %alloc = call i8* @malloc(i32 %size) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) br i1 %n, label %flag_true, label %flag_false Index: llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll =================================================================== --- llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll +++ llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll @@ -9,8 +9,8 @@ %this.addr = alloca i64 store i64 %this_arg, i64* %this.addr %this = load i64, i64* %this.addr - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) - %size = call i32 @llvm.coro.size.i32() + %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null) + %size = call i32 @llvm.coro.size.i32(i1 true) %alloc = call i8* @myAlloc(i64 %this, i32 %size) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) %0 = call i8 @llvm.coro.suspend(token none, i1 false) @@ -45,7 +45,7 @@ ; CHECK: ret void declare i8* @llvm.coro.free(token, i8*) -declare i32 @llvm.coro.size.i32() +declare i32 @llvm.coro.size.i32(i1) declare i8 @llvm.coro.suspend(token, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) Index: llvm/test/Transforms/Coroutines/ArgAddr.ll =================================================================== --- llvm/test/Transforms/Coroutines/ArgAddr.ll +++ llvm/test/Transforms/Coroutines/ArgAddr.ll @@ -21,10 +21,10 @@ ; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP1]], align 4 ; entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null); + %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null); %n.addr = alloca i32 store i32 %n, i32* %n.addr ; this needs to go after coro.begin - %0 = tail call i32 @llvm.coro.size.i32() + %0 = tail call i32 @llvm.coro.size.i32(i1 true) %call = tail call i8* @malloc(i32 %0) %1 = tail call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %call) %2 = bitcast i32* %n.addr to i8* @@ -69,7 +69,7 @@ declare void @ctor(i8* nocapture readonly) declare token @llvm.coro.id(i32, i8*, i8*, i8*) -declare i32 @llvm.coro.size.i32() +declare i32 @llvm.coro.size.i32(i1) declare i8* @llvm.coro.begin(token, i8*) declare i8 @llvm.coro.suspend(token, i1) declare i8* @llvm.coro.free(token, i8*) Index: llvm/lib/Transforms/Coroutines/Coroutines.cpp =================================================================== --- llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -234,6 +234,7 @@ Shape.CoroBegin = nullptr; Shape.CoroEnds.clear(); Shape.CoroSizes.clear(); + Shape.CoroAligns.clear(); Shape.CoroSuspends.clear(); Shape.FrameTy = nullptr; @@ -267,6 +268,10 @@ continue; case Intrinsic::coro_size: CoroSizes.push_back(cast<CoroSizeInst>(II)); + HandleOverAlign = HandleOverAlign || CoroSizes.back()->isAlloc(); + break; + case Intrinsic::coro_align: + CoroAligns.push_back(cast<CoroAlignInst>(II)); break; case Intrinsic::coro_frame: CoroFrames.push_back(cast<CoroFrameInst>(II)); Index: llvm/lib/Transforms/Coroutines/CoroSplit.cpp =================================================================== --- llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -69,6 +69,7 @@ #include <cstdint> #include <initializer_list> #include <iterator> +#include <stdint.h> using namespace llvm; @@ -997,23 +998,39 @@ Shape.AsyncLowering.AsyncFuncPointer->setInitializer(NewFuncPtrStruct); } -static void replaceFrameSize(coro::Shape &Shape) { +static void replaceFrameSizeAndAlign(coro::Shape &Shape) { if (Shape.ABI == coro::ABI::Async) updateAsyncFuncPointerContextSize(Shape); - if (Shape.CoroSizes.empty()) - return; + if (!Shape.CoroSizes.empty()) { + // In the same function all coro.sizes should have the same result type. + auto *SizeIntrin = Shape.CoroSizes.back(); + Module *M = SizeIntrin->getModule(); + const DataLayout &DL = M->getDataLayout(); + auto Size = DL.getTypeAllocSize(Shape.FrameTy); - // In the same function all coro.sizes should have the same result type. - auto *SizeIntrin = Shape.CoroSizes.back(); - Module *M = SizeIntrin->getModule(); - const DataLayout &DL = M->getDataLayout(); - auto Size = DL.getTypeAllocSize(Shape.FrameTy); - auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), Size); + uint64_t FrameAlign = Shape.FrameAlign.value(); + uint64_t NewAlign = Shape.getSwitchCoroId()->getAlignment(); + uint64_t Extra = FrameAlign > NewAlign ? FrameAlign - NewAlign : 0; - for (CoroSizeInst *CS : Shape.CoroSizes) { - CS->replaceAllUsesWith(SizeConstant); - CS->eraseFromParent(); + for (CoroSizeInst *CS : Shape.CoroSizes) { + // Over allocate if needed. + uint64_t FrameSize = Size + (Shape.HandleOverAlign ? Extra : 0); + auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), FrameSize); + CS->replaceAllUsesWith(SizeConstant); + CS->eraseFromParent(); + } + } + + if (!Shape.CoroAligns.empty()) { + auto *Intrin = Shape.CoroAligns.back(); + auto *AlignConstant = + ConstantInt::get(Intrin->getType(), Shape.FrameAlign.value()); + + for (CoroAlignInst *CS : Shape.CoroAligns) { + CS->replaceAllUsesWith(AlignConstant); + CS->eraseFromParent(); + } } } @@ -1748,7 +1765,7 @@ simplifySuspendPoints(Shape); buildCoroutineFrame(F, Shape); - replaceFrameSize(Shape); + replaceFrameSizeAndAlign(Shape); // If there are no suspend points, no split required, just remove // the allocation and deallocation blocks, they are not needed. Index: llvm/lib/Transforms/Coroutines/CoroInternal.h =================================================================== --- llvm/lib/Transforms/Coroutines/CoroInternal.h +++ llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -99,6 +99,7 @@ CoroBeginInst *CoroBegin; SmallVector<AnyCoroEndInst *, 4> CoroEnds; SmallVector<CoroSizeInst *, 2> CoroSizes; + SmallVector<CoroAlignInst *, 2> CoroAligns; SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends; SmallVector<CallInst*, 2> SwiftErrorOps; @@ -124,8 +125,8 @@ uint64_t FrameSize; Instruction *FramePtr; BasicBlock *AllocaSpillBlock; - - bool ReuseFrameSlot; + bool HandleOverAlign = false; + bool ReuseFrameSlot = false; struct SwitchLoweringStorage { SwitchInst *ResumeSwitch; @@ -269,7 +270,7 @@ void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; Shape() = default; - explicit Shape(Function &F, bool ReuseFrameSlot = false) + explicit Shape(Function &F, bool ReuseFrameSlot) : ReuseFrameSlot(ReuseFrameSlot) { buildFrom(F); } Index: llvm/lib/Transforms/Coroutines/CoroInstr.h =================================================================== --- llvm/lib/Transforms/Coroutines/CoroInstr.h +++ llvm/lib/Transforms/Coroutines/CoroInstr.h @@ -121,6 +121,10 @@ : cast<AllocaInst>(Arg->stripPointerCasts()); } + unsigned getAlignment() const { + return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue(); + } + void clearPromise() { Value *Arg = getArgOperand(PromiseArg); setArgOperand(PromiseArg, @@ -402,9 +406,9 @@ /// This represents the llvm.coro.free instruction. class LLVM_LIBRARY_VISIBILITY CoroFreeInst : public IntrinsicInst { +public: enum { IdArg, FrameArg }; -public: Value *getFrame() const { return getArgOperand(FrameArg); } // Methods to support type inquiry through isa, cast, and dyn_cast: @@ -589,7 +593,12 @@ /// This represents the llvm.coro.size instruction. class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { + enum { AllocArg }; + public: + bool isAlloc() const { + return cast<Constant>(getArgOperand(0))->isOneValue(); + } // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_size; @@ -599,6 +608,18 @@ } }; +/// This represents the llvm.coro.align instruction. +class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_align; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst { enum { FrameArg, UnwindArg }; Index: llvm/lib/Transforms/Coroutines/CoroFrame.cpp =================================================================== --- llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -14,17 +14,21 @@ // the value into the coroutine frame. //===----------------------------------------------------------------------===// +#include "CoroInstr.h" #include "CoroInternal.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/StackLifetime.h" #include "llvm/Config/llvm-config.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -486,6 +490,8 @@ return StructAlign; } + SmallVector<Field, 8> &getFields() { return Fields; } + FieldIDType getLayoutFieldIndex(FieldIDType Id) const { assert(IsFinished && "not yet finished!"); return Fields[Id].LayoutFieldIndex; @@ -710,6 +716,77 @@ IsFinished = true; } +// Adapted from CodeGenFunction::EmitBuiltinAlignTo. +static Value *emitAlignUpTo(IRBuilder<> &Builder, Value *Src, uint64_t Align) { + const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); + + auto *SrcType = cast<PointerType>(Src->getType()); + IntegerType *IntType = IntegerType::get(Builder.getContext(), + DL.getIndexTypeSizeInBits(SrcType)); + Value *Alignment = ConstantInt::get(IntType, Align); + auto *One = ConstantInt::get(IntType, 1); + Value *Mask = Builder.CreateSub(Alignment, One, "mask"); + Value *SrcAddr = Builder.CreatePtrToInt(Src, IntType, "intptr"); + + // When aligning up we have to first add the mask to ensure we go over the + // next alignment value and then align down to the next valid multiple. + // By adding the mask, we ensure that align_up on an already aligned + // value will not change the value. + Value *SrcForMask = Builder.CreateAdd(SrcAddr, Mask, "over_boundary"); + + // Invert the mask to only clear the lower bits. + Value *InvertedMask = Builder.CreateNot(Mask, "inverted_mask"); + Value *Result = Builder.CreateAnd(SrcForMask, InvertedMask, "aligned_result"); + + Result->setName("aligned_intptr"); + Value *Difference = Builder.CreateSub(Result, SrcAddr, "diff"); + // The result must point to the same underlying allocation. This means we + // can use an inbounds GEP to enable better optimization. + + PointerType *DestType = Builder.getInt8PtrTy(); + if (unsigned AddrSpace = SrcType->getAddressSpace()) + DestType = Type::getInt8PtrTy(Builder.getContext(), AddrSpace); + + Value *Base = Src; + if (SrcType != DestType) + Base = Builder.CreateBitCast(Src, DestType); + + // Out-of-bound case could not happen. + Result = Builder.CreateGEP(Base, Difference, "aligned_result"); + Result = Builder.CreatePointerCast(Result, SrcType); + + Type *IntPtrTy = Builder.getIntPtrTy(DL); + if (Alignment->getType() != IntPtrTy) + Alignment = + Builder.CreateIntCast(Alignment, IntPtrTy, false, "casted.align"); + (void)Builder.CreateAlignmentAssumption(DL, Result, Alignment); + assert(Result->getType() == SrcType); + return Result; +} + +static void replaceCoroFreeMemArg(CoroIdInst *CoroId, + AllocaInst *FramePtrAddr) { + SmallVector<CoroFreeInst *, 4> CoroFrees; + for (User *U : CoroId->users()) + if (auto *CF = dyn_cast<CoroFreeInst>(U)) + CoroFrees.push_back(CF); + + if (CoroFrees.empty()) + return; + + for (CoroFreeInst *CF : CoroFrees) { + // Assume CallInst users of `coro.free` are dealloc functions. + + // CF->setArgOperand(CoroFreeInst::FrameArg, FrameAddr); + for (Use &U : CF->uses()) + if (CallBase *CB = dyn_cast<CallBase>(U.getUser())) { + LoadInst *FrameAddr = new LoadInst(FramePtrAddr->getAllocatedType(), + FramePtrAddr, "raw.frame.ptr", CB); + CB->replaceUsesOfWith(CF, FrameAddr); + } + } +} + // Build a struct that will keep state for an active coroutine. // struct f.frame { // ResumeFnTy ResumeFnAddr; @@ -779,6 +856,41 @@ FrameData.setFieldIndex(S.first, Id); } + Align FrameAlign = + std::max_element( + B.getFields().begin(), B.getFields().end(), + [](auto &F1, auto &F2) { return F1.Alignment < F2.Alignment; }) + ->Alignment; + + // Check for over-alignment, if not, check FramePtrAlloc store could be + // optimized out after corosplit. + if (Shape.HandleOverAlign && + FrameAlign.value() > Shape.getSwitchCoroId()->getAlignment()) { + BasicBlock &Entry = F.getEntryBlock(); + IRBuilder<> Builder(&Entry, Entry.getFirstInsertionPt()); + + // Save to raw frame pointer to alloca + Value *Mem = Shape.CoroBegin->getMem(); + AllocaInst *FramePtrAddr = + Builder.CreateAlloca(Mem->getType(), nullptr, "alloc.frame.ptr"); + Builder.SetInsertPoint(Shape.CoroBegin); + Value *MockMem = Builder.CreatePointerCast(FramePtrAddr, Mem->getType()); + Builder.CreateStore(MockMem, FramePtrAddr); + + // Ajust frame pointer value. + Value *NewMem = emitAlignUpTo(Builder, MockMem, FrameAlign.value()); + Mem->replaceAllUsesWith(NewMem); + MockMem->replaceAllUsesWith(Mem); + cast<Instruction>(MockMem)->eraseFromParent(); + + // Replace all corofree second arg with raw frame pointer loaded from + // alloca. + replaceCoroFreeMemArg(Shape.getSwitchCoroId(), FramePtrAddr); + + // Add alloca to frame. + (void)B.addFieldForAlloca(FramePtrAddr); + } + B.finish(FrameTy); FrameData.updateLayoutIndex(B); Shape.FrameAlign = B.getStructAlign(); Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -1236,7 +1236,8 @@ def int_coro_frame : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; def int_coro_noop : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; -def int_coro_size : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; +def int_coro_size : Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], [IntrNoMem]>; +def int_coro_align : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; def int_coro_save : Intrinsic<[llvm_token_ty], [llvm_ptr_ty], []>; def int_coro_suspend : Intrinsic<[llvm_i8_ty], [llvm_token_ty, llvm_i1_ty], []>; Index: llvm/docs/Coroutines.rst =================================================================== --- llvm/docs/Coroutines.rst +++ llvm/docs/Coroutines.rst @@ -927,8 +927,8 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: - declare i32 @llvm.coro.size.i32() - declare i64 @llvm.coro.size.i64() + declare i32 @llvm.coro.size.i32(i1 <alloc>) + declare i64 @llvm.coro.size.i64(i1 <alloc>) Overview: """"""""" @@ -940,7 +940,10 @@ Arguments: """""""""" -None +The first argument is a boolean indicating that the retuned value is used by +alloc/dealloc functions. If it is true, overallocate may be performed to handle +over-alignment. All alloc/delloc functions should use ``llvm.coro.size`` with +the same ``alloc`` value. Semantics: """""""""" @@ -948,6 +951,32 @@ The `coro.size` intrinsic is lowered to a constant representing the size of the coroutine frame. +.. _coro.align: + +'llvm.coro.align' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:: + + declare i32 @llvm.coro.align.i32() + declare i64 @llvm.coro.align.i64() + +Overview: +""""""""" + +The '``llvm.coro.align``' intrinsic returns the alignment of the coroutine frame +in bytes. This is only supported for switched-resume coroutines. + +Arguments: +"""""""""" + +None + +Semantics: +"""""""""" + +The `coro.align` intrinsic is lowered to a constant representing the alignment +of the coroutine frame. + .. _coro.begin: 'llvm.coro.begin' Intrinsic Index: clang/lib/Sema/SemaCoroutine.cpp =================================================================== --- clang/lib/Sema/SemaCoroutine.cpp +++ clang/lib/Sema/SemaCoroutine.cpp @@ -1375,8 +1375,12 @@ Expr *FramePtr = buildBuiltinCall(S, Loc, Builtin::BI__builtin_coro_frame, {}); + Expr *IsAlloc = IntegerLiteral::Create( + S.Context, llvm::APInt(32, 1), + S.Context.getIntTypeForBitwidth(32, /*Signed=*/0), Loc); + Expr *FrameSize = - buildBuiltinCall(S, Loc, Builtin::BI__builtin_coro_size, {}); + buildBuiltinCall(S, Loc, Builtin::BI__builtin_coro_size, {IsAlloc}); // Make new call. Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -4430,11 +4430,12 @@ return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::__fastfail, E)); case Builtin::BI__builtin_coro_size: { + Value *Arg0 = EmitScalarExpr(E->getArg(0)); auto & Context = getContext(); auto SizeTy = Context.getSizeType(); auto T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); Function *F = CGM.getIntrinsic(Intrinsic::coro_size, T); - return RValue::get(Builder.CreateCall(F)); + return RValue::get(Builder.CreateCall(F, {Arg0})); } case Builtin::BI__builtin_coro_id: Index: clang/include/clang/Basic/Builtins.def =================================================================== --- clang/include/clang/Basic/Builtins.def +++ clang/include/clang/Basic/Builtins.def @@ -1573,7 +1573,7 @@ BUILTIN(__builtin_coro_done, "bv*", "n") BUILTIN(__builtin_coro_promise, "v*v*IiIb", "n") -BUILTIN(__builtin_coro_size, "z", "n") +BUILTIN(__builtin_coro_size, "zb", "n") BUILTIN(__builtin_coro_frame, "v*", "n") BUILTIN(__builtin_coro_noop, "v*", "n") BUILTIN(__builtin_coro_free, "v*v*", "n") Index: clang/docs/LanguageExtensions.rst =================================================================== --- clang/docs/LanguageExtensions.rst +++ clang/docs/LanguageExtensions.rst @@ -2686,7 +2686,7 @@ .. code-block:: c - size_t __builtin_coro_size() + size_t __builtin_coro_size(bool alloc) void *__builtin_coro_frame() void *__builtin_coro_free(void *coro_frame)
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits