asavonic created this revision. Herald added subscribers: danielkiss, kristof.beyls. asavonic requested review of this revision. Herald added a project: clang. Herald added a subscriber: cfe-commits.
If the target ABI requires coercion to a larger type, higher bits of the resulting value are supposed to be undefined. However, before this patch Clang CG used to generate a `zext` instruction to coerce a value to a larger type, forcing higher bits to zero. This is problematic in some cases: struct st { int i; }; struct st foo(int i); struct st bar(int x) { return foo(x); } For AArch64 Clang generates the following LLVM IR: define i64 @bar(i32 %x) { %call = call i64 @foo(i32 %0) %coerce.val.ii = trunc i64 %call to i32 ;; ... store to alloca and load back %coerce.val.ii2 = zext i32 %1 to i64 ret i64 %coerce.val.ii2 } Coercion is done with a `trunc` and a `zext`. After optimizations we get the following: define i64 @bar(i32 %x) local_unnamed_addr #0 { entry: %call = tail call i64 @foo(i32 %x) %coerce.val.ii2 = and i64 %call, 4294967295 ret i64 %coerce.val.ii2 } The compiler has to keep the semantics of the `zext` instruction, even though no extension or truncation is required in this case. This extra `and` instruction also prevents tail call optimization. In order to keep information about undefined higher bits, the patch replaces `zext` with a sequence of an `insertelement` and a `bitcast`: define i64 @_Z3bari(i32 %x) local_unnamed_addr #0 { entry: %call = tail call i64 @_Z3fooi(i32 %x) #2 %coerce.val.ii = trunc i64 %call to i32 %coerce.val.vec = insertelement <2 x i32> undef, i32 %coerce.val.ii, i8 0 %coerce.val.vec.ii = bitcast <2 x i32> %coerce.val.vec to i64 ret i64 %coerce.val.vec.ii } InstCombiner can then fold this sequence into a nop, and allow tail call optimization. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D100225 Files: clang/lib/CodeGen/CGCall.cpp clang/test/CodeGen/arm64-arguments.c clang/test/CodeGenCXX/trivial_abi.cpp
Index: clang/test/CodeGenCXX/trivial_abi.cpp =================================================================== --- clang/test/CodeGenCXX/trivial_abi.cpp +++ clang/test/CodeGenCXX/trivial_abi.cpp @@ -202,7 +202,8 @@ // CHECK: %[[RETVAL:.*]] = alloca %[[STRUCT_TRIVIAL:.*]], align 4 // CHECK: %[[COERCE_DIVE:.*]] = getelementptr inbounds %[[STRUCT_TRIVIAL]], %[[STRUCT_TRIVIAL]]* %[[RETVAL]], i32 0, i32 0 // CHECK: %[[V0:.*]] = load i32, i32* %[[COERCE_DIVE]], align 4 -// CHECK: %[[COERCE_VAL_II:.*]] = zext i32 %[[V0]] to i64 +// CHECK: %[[COERCE_VAL_VEC:.*]] = insertelement <2 x i32> undef, i32 %[[V0]], i8 0 +// CHECK: %[[COERCE_VAL_II:.*]] = bitcast <2 x i32> %[[COERCE_VAL_VEC]] to i64 // CHECK: ret i64 %[[COERCE_VAL_II]] // CHECK: } Index: clang/test/CodeGen/arm64-arguments.c =================================================================== --- clang/test/CodeGen/arm64-arguments.c +++ clang/test/CodeGen/arm64-arguments.c @@ -745,3 +745,63 @@ // CHECK: call <3 x float> (i32, ...) @test_hva_v3(i32 1, [4 x <4 x float>] {{.*}}) return test_hva_v3(1, *a); } + +char ret_coerce1(void) { + // CHECK-LABEL: i8 @ret_coerce1 + // CHECK: alloca i8 + // CHECK-NEXT: load i8 + // CHECK-NEXT: ret i8 +} + +short ret_coerce2(void) { + // CHECK-LABEL: i16 @ret_coerce2 + // CHECK: alloca i16 + // CHECK-NEXT: load i16 + // CHECK-NEXT: ret i16 +} + +int ret_coerce3(void) { + // CHECK-LABEL: i32 @ret_coerce3 + // CHECK: alloca i32 + // CHECK-NEXT: load i32 + // CHECK-NEXT: ret i32 +} + +struct ret_coerce_char { + char f0; +}; +struct ret_coerce_char ret_coerce4(void) { + // CHECK-LABEL: i64 @ret_coerce4 + // CHECK: %[[ALLOCA:.*]] = alloca %struct.ret_coerce_char + // CHECK: %[[GEP:.*]] = getelementptr {{.*}} %[[ALLOCA]], i32 0, i32 0 + // CHECK: %[[LOAD:.*]] = load i8, i8* %[[GEP]] + // CHECK: %[[VEC:.*]] = insertelement <8 x i8> undef, i8 %[[LOAD]], i8 0 + // CHECK: %[[CAST:.*]] = bitcast <8 x i8> %[[VEC]] to i64 + // CHECK: ret i64 %[[CAST]] +} + +struct ret_coerce_short { + short f0; 
+}; +struct ret_coerce_short ret_coerce5(void) { + // CHECK-LABEL: i64 @ret_coerce5 + // CHECK: %[[ALLOCA:.*]] = alloca %struct.ret_coerce_short + // CHECK: %[[GEP:.*]] = getelementptr {{.*}} %[[ALLOCA]], i32 0, i32 0 + // CHECK: %[[LOAD:.*]] = load i16, i16* %[[GEP]] + // CHECK: %[[VEC:.*]] = insertelement <4 x i16> undef, i16 %[[LOAD]], i8 0 + // CHECK: %[[CAST:.*]] = bitcast <4 x i16> %[[VEC]] to i64 + // CHECK: ret i64 %[[CAST]] +} + +struct ret_coerce_int { + int f0; +}; +struct ret_coerce_int ret_coerce6(void) { + // CHECK-LABEL: i64 @ret_coerce6 + // CHECK: %[[ALLOCA:.*]] = alloca %struct.ret_coerce_int + // CHECK: %[[GEP:.*]] = getelementptr {{.*}} %[[ALLOCA]], i32 0, i32 0 + // CHECK: %[[LOAD:.*]] = load i32, i32* %[[GEP]] + // CHECK: %[[VEC:.*]] = insertelement <2 x i32> undef, i32 %[[LOAD]], i8 0 + // CHECK: %[[CAST:.*]] = bitcast <2 x i32> %[[VEC]] to i64 + // CHECK: ret i64 %[[CAST]] +} Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -1171,8 +1171,8 @@ /// This behaves as if the value were coerced through memory, so on big-endian /// targets the high bits are preserved in a truncation, while little-endian /// targets preserve the low bits. -static llvm::Value *CoerceIntOrPtrToIntOrPtr(llvm::Value *Val, - llvm::Type *Ty, +static llvm::Value *CoerceIntOrPtrToIntOrPtr(llvm::Value *Val, llvm::Type *Ty, + bool UndefUnusedBits, CodeGenFunction &CGF) { if (Val->getType() == Ty) return Val; @@ -1206,8 +1206,26 @@ Val = CGF.Builder.CreateShl(Val, DstSize - SrcSize, "coerce.highbits"); } } else { - // Little-endian targets preserve the low bits. No shifts required. 
- Val = CGF.Builder.CreateIntCast(Val, DestIntTy, false, "coerce.val.ii"); + llvm::IntegerType *OrigType = cast<llvm::IntegerType>(Val->getType()); + unsigned OrigWidth = OrigType->getBitWidth(); + unsigned DestWidth = cast<llvm::IntegerType>(DestIntTy)->getBitWidth(); + if (UndefUnusedBits && DestWidth > OrigWidth && + DestWidth % OrigWidth == 0) { + // Insert the value in an undef vector, and then bitcast the vector to + // the destination type. Unused vector elements and the corresponding + // bits of the destination value can be treated as undef by + // optimizations. + llvm::VectorType *VecType = + llvm::VectorType::get(OrigType, DestWidth / OrigWidth, + /*Scalable=*/false); + llvm::Value *Vec = CGF.Builder.CreateInsertElement( + llvm::UndefValue::get(VecType), Val, CGF.Builder.getInt8(0), + "coerce.val.vec"); + Val = CGF.Builder.CreateBitCast(Vec, DestIntTy, "coerce.val.vec.ii"); + } else { + // Little-endian targets preserve the low bits. No shifts required. + Val = CGF.Builder.CreateIntCast(Val, DestIntTy, false, "coerce.val.ii"); + } } } @@ -1216,8 +1234,6 @@ return Val; } - - /// CreateCoercedLoad - Create a load from \arg SrcPtr interpreted as /// a pointer to an object of type \arg Ty, known to be aligned to /// \arg SrcAlign bytes. @@ -1226,6 +1242,7 @@ /// destination type; in this situation the values of bits which not /// present in the src are undefined. static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty, + bool UndefUnusedBits, CodeGenFunction &CGF) { llvm::Type *SrcTy = Src.getElementType(); @@ -1248,7 +1265,7 @@ if ((isa<llvm::IntegerType>(Ty) || isa<llvm::PointerType>(Ty)) && (isa<llvm::IntegerType>(SrcTy) || isa<llvm::PointerType>(SrcTy))) { llvm::Value *Load = CGF.Builder.CreateLoad(Src); - return CoerceIntOrPtrToIntOrPtr(Load, Ty, CGF); + return CoerceIntOrPtrToIntOrPtr(Load, Ty, UndefUnusedBits, CGF); } // If load is legal, just bitcast the src pointer. 
@@ -1314,9 +1331,8 @@ /// /// This safely handles the case when the src type is larger than the /// destination type; the upper bits of the src will be lost. -static void CreateCoercedStore(llvm::Value *Src, - Address Dst, - bool DstIsVolatile, +static void CreateCoercedStore(llvm::Value *Src, Address Dst, + bool DstIsVolatile, bool UndefUnusedBits, CodeGenFunction &CGF) { llvm::Type *SrcTy = Src->getType(); llvm::Type *DstTy = Dst.getElementType(); @@ -1346,7 +1362,7 @@ // extension or truncation to the desired type. if ((isa<llvm::IntegerType>(SrcTy) || isa<llvm::PointerType>(SrcTy)) && (isa<llvm::IntegerType>(DstTy) || isa<llvm::PointerType>(DstTy))) { - Src = CoerceIntOrPtrToIntOrPtr(Src, DstTy, CGF); + Src = CoerceIntOrPtrToIntOrPtr(Src, DstTy, UndefUnusedBits, CGF); CGF.Builder.CreateStore(Src, Dst, DstIsVolatile); return; } @@ -2873,7 +2889,8 @@ assert(NumIRArgs == 1); auto AI = Fn->getArg(FirstIRArg); AI->setName(Arg->getName() + ".coerce"); - CreateCoercedStore(AI, Ptr, /*DstIsVolatile=*/false, *this); + CreateCoercedStore(AI, Ptr, /*DstIsVolatile=*/false, + /*UndefUnusedBits=*/false, *this); } // Match to what EmitParmDecl is expecting for this type. @@ -3458,7 +3475,8 @@ // If the value is offset in memory, apply the offset now. Address V = emitAddressAtOffset(*this, ReturnValue, RetAI); - RV = CreateCoercedLoad(V, RetAI.getCoerceToType(), *this); + RV = CreateCoercedLoad(V, RetAI.getCoerceToType(), + /*UndefUnusedBits=*/true, *this); } // In ARC, end functions that return a retainable type with a call @@ -4928,8 +4946,8 @@ } else { // In the simple case, just pass the coerced loaded value. 
assert(NumIRArgs == 1); - llvm::Value *Load = - CreateCoercedLoad(Src, ArgInfo.getCoerceToType(), *this); + llvm::Value *Load = CreateCoercedLoad(Src, ArgInfo.getCoerceToType(), + /*UndefUnusedBits=*/false, *this); if (CallInfo.isCmseNSCall()) { // For certain parameter types, clear padding bits, as they may reveal @@ -5406,7 +5424,8 @@ // If the value is offset in memory, apply the offset now. Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI); - CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this); + CreateCoercedStore(CI, StorePtr, DestIsVolatile, /*UndefUnusedBits=*/true, + *this); return convertTempToRValue(DestPtr, RetTy, SourceLocation()); }
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits