https://github.com/jyknight updated https://github.com/llvm/llvm-project/pull/74275
>From 7baffd6d1f4254b1bd725ddc883a360d79267435 Mon Sep 17 00:00:00 2001
From: James Y Knight <jykni...@google.com>
Date: Sat, 2 Dec 2023 23:05:26 -0500
Subject: [PATCH 1/3] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX

In late 2021, both Intel and AMD finally documented that every
AVX-capable CPU has always been guaranteed to execute aligned 16-byte
loads/stores atomically, and, further, guaranteed that all future CPUs
with AVX will do so as well. Therefore, we may use normal SSE 128-bit
load/store instructions to implement atomics when AVX is enabled.

Also adjust the handling of unordered atomic load/store in
LegalizeIntegerTypes.cpp: currently, it hardcodes a fallback to
ATOMIC_CMP_SWAP_WITH_SUCCESS, but we should instead fall back to
ATOMIC_LOAD/ATOMIC_STORE.

Per the AMD64 Architecture Programmer's Manual, 7.3.2 Access Atomicity:

"""
Processors that report [AVX] extend the atomicity for cacheable,
naturally-aligned single loads or stores from a quadword to a double
quadword.
"""

Per Intel's SDM:

"""
Processors that enumerate support for Intel(R) AVX guarantee that the
16-byte memory operations performed by the following instructions will
always be carried out atomically:
- MOVAPD, MOVAPS, and MOVDQA.
- VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
- VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
  EVEX.128 and k0 (masking disabled).
"""

This was also confirmed to hold for Zhaoxin CPUs with AVX, in
https://gcc.gnu.org/PR104688
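As a sketch of the intended codegen change (assembly abbreviated from
the test updates below; exact instruction selection may vary), an
aligned 16-byte atomic load such as

  define i128 @load_i128(ptr %ptr) {
    %v = load atomic i128, ptr %ptr unordered, align 16
    ret i128 %v
  }

previously had to go through a lock cmpxchg16b loop, but with AVX
enabled can now be lowered to a single atomic vector load:

  vmovdqa (%rdi), %xmm0
  vmovq   %xmm0, %rax
  vpextrq $1, %xmm0, %rdx
  retq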
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp   |  28 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp     |  94 +++++--
 llvm/test/CodeGen/X86/atomic-non-integer.ll |  24 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll   |  83 +-----
 llvm/test/CodeGen/X86/atomic128.ll          | 247 +++++++++++-------
 llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll    |   8 +-
 6 files changed, 256 insertions(+), 228 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 54698edce7d6f8..5b496feee7a8f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3831,17 +3831,14 @@ void DAGTypeLegalizer::ExpandIntRes_XROUND_XRINT(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, SDValue &Lo,
                                          SDValue &Hi) {
   if (N->isAtomic()) {
-    // It's typical to have larger CAS than atomic load instructions.
     SDLoc dl(N);
     EVT VT = N->getMemoryVT();
-    SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
-    SDValue Zero = DAG.getConstant(0, dl, VT);
-    SDValue Swap = DAG.getAtomicCmpSwap(
-        ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
-        VT, VTs, N->getOperand(0),
-        N->getOperand(1), Zero, Zero, N->getMemOperand());
-    ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
-    ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
+    // We may support larger values in atomic_load than in a normal load
+    // (without splitting), so switch over if needed.
+    SDValue New = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, N->getOperand(0),
+                                N->getOperand(1), N->getMemOperand());
+    ReplaceValueWith(SDValue(N, 0), New.getValue(0));
+    ReplaceValueWith(SDValue(N, 1), New.getValue(1));
     return;
   }
 
@@ -5399,14 +5396,13 @@ SDValue DAGTypeLegalizer::ExpandIntOp_XINT_TO_FP(SDNode *N) {
 
 SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
   if (N->isAtomic()) {
-    // It's typical to have larger CAS than atomic store instructions.
+    // We may support larger values in atomic_store than in a normal store
+    // (without splitting), so switch over if needed.
     SDLoc dl(N);
-    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
-                                 N->getMemoryVT(),
-                                 N->getOperand(0), N->getOperand(2),
-                                 N->getOperand(1),
-                                 N->getMemOperand());
-    return Swap.getValue(1);
+    SDValue New =
+        DAG.getAtomic(ISD::ATOMIC_STORE, dl, N->getMemoryVT(), N->getOperand(0),
+                      N->getOperand(1), N->getOperand(2), N->getMemOperand());
+    return New.getValue(0);
   }
   if (ISD::isNormalStore(N))
     return ExpandOp_NormalStore(N, OpNo);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6167be7bdf84e9..1880cbc3a5bf35 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -515,6 +515,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (!Subtarget.is64Bit())
     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
 
+  if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+    // All CPUs supporting AVX will atomically load/store aligned 128-bit
+    // values, so we can emit [V]MOVAPS/[V]MOVDQA.
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+  }
+
   if (Subtarget.canUseCMPXCHG16B())
     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 
@@ -30101,12 +30108,16 @@ TargetLoweringBase::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *MemType = SI->getValueOperand()->getType();
 
-  bool NoImplicitFloatOps =
-      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
                                  : AtomicExpansionKind::None;
@@ -30121,12 +30132,16 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
   // can use movq to do the load. If we have X87 we can load into an 80-bit
   // X87 register and store it to a stack temporary.
-  bool NoImplicitFloatOps =
-      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                  : AtomicExpansionKind::None;
@@ -31277,14 +31292,23 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
   if (!IsSeqCst && IsTypeLegal)
     return Op;
 
-  if (VT == MVT::i64 && !IsTypeLegal) {
+  if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
+      !DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat)) {
+    SDValue Chain;
+    // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
+    // vector store.
+    if (VT == MVT::i128) {
+      if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+        SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
+        Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
+                             Node->getMemOperand());
+      }
+    }
+
     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
     // is enabled.
-    bool NoImplicitFloatOps =
-        DAG.getMachineFunction().getFunction().hasFnAttribute(
-            Attribute::NoImplicitFloat);
-    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
-      SDValue Chain;
+    if (VT == MVT::i64) {
       if (Subtarget.hasSSE1()) {
         SDValue SclToVec =
             DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
@@ -31316,15 +31340,15 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
           DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
                                   StoreOps, MVT::i64, Node->getMemOperand());
     }
+  }
 
-    if (Chain) {
-      // If this is a sequentially consistent store, also emit an appropriate
-      // barrier.
-      if (IsSeqCst)
-        Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+  if (Chain) {
+    // If this is a sequentially consistent store, also emit an appropriate
+    // barrier.
+    if (IsSeqCst)
+      Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
 
-      return Chain;
-    }
+    return Chain;
   }
 }
 
@@ -32877,12 +32901,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::ATOMIC_LOAD: {
-    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+    assert(
+        (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
+        "Unexpected VT!");
     bool NoImplicitFloatOps =
         DAG.getMachineFunction().getFunction().hasFnAttribute(
             Attribute::NoImplicitFloat);
     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
       auto *Node = cast<AtomicSDNode>(N);
+
+      if (N->getValueType(0) == MVT::i128) {
+        if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+          SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
+                                   Node->getBasePtr(), Node->getMemOperand());
+          SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                     DAG.getIntPtrConstant(0, dl));
+          SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                     DAG.getIntPtrConstant(1, dl));
+          Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
+                                        {ResL, ResH}));
+          Results.push_back(Ld.getValue(1));
+          return;
+        }
+        break;
+      }
       if (Subtarget.hasSSE1()) {
         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
         // Then extract the lower 64-bits.
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 7d2810e57a25b5..22b45b13aae227 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -207,14 +207,7 @@ define void @store_fp128(ptr %fptr, fp128 %v) {
 ;
 ; X64-AVX-LABEL: store_fp128:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    subq $24, %rsp
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 32
-; X64-AVX-NEXT:    vmovaps %xmm0, (%rsp)
-; X64-AVX-NEXT:    movq (%rsp), %rsi
-; X64-AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-AVX-NEXT:    callq __sync_lock_test_and_set_16@PLT
-; X64-AVX-NEXT:    addq $24, %rsp
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
 ; X64-AVX-NEXT:    retq
   store atomic fp128 %v, ptr %fptr unordered, align 16
   ret void
@@ -592,18 +585,9 @@ define fp128 @load_fp128(ptr %fptr) {
 ;
 ; X64-AVX-LABEL: load_fp128:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    subq $24, %rsp
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 32
-; X64-AVX-NEXT:    xorl %esi, %esi
-; X64-AVX-NEXT:    xorl %edx, %edx
-; X64-AVX-NEXT:    xorl %ecx, %ecx
-; X64-AVX-NEXT:    xorl %r8d, %r8d
-; X64-AVX-NEXT:    callq __sync_val_compare_and_swap_16@PLT
-; X64-AVX-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rax, (%rsp)
-; X64-AVX-NEXT:    vmovaps (%rsp), %xmm0
-; X64-AVX-NEXT:    addq $24, %rsp
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
 ; X64-AVX-NEXT:    retq
   %v = load atomic fp128, ptr %fptr unordered, align 16
   ret fp128 %v
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index b66988c8bd24b5..91e427189de477 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -230,34 +230,12 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
 }
 
 define i128 @load_i128(ptr %ptr) {
-; CHECK-O0-LABEL: load_i128:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O0-NEXT:    xorl %eax, %eax
-; CHECK-O0-NEXT:    movl %eax, %ebx
-; CHECK-O0-NEXT:    movq %rbx, %rax
-; CHECK-O0-NEXT:    movq %rbx, %rdx
-; CHECK-O0-NEXT:    movq %rbx, %rcx
-; CHECK-O0-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O0-NEXT:    popq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT:    retq
-;
-; CHECK-O3-LABEL: load_i128:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O3-NEXT:    xorl %eax, %eax
-; CHECK-O3-NEXT:    xorl %edx, %edx
-; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    xorl %ebx, %ebx
-; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT:    popq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT:    retq
+; CHECK-LABEL: load_i128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-NEXT:    vmovq %xmm0, %rax
+; CHECK-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-NEXT:    retq
   %v = load atomic i128, ptr %ptr unordered, align 16
   ret i128 %v
 }
@@ -265,51 +243,18 @@ define i128 @load_i128(ptr %ptr) {
 
 define void @store_i128(ptr %ptr, i128 %v) {
 ; CHECK-O0-LABEL: store_i128:
 ; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq (%rdi), %rax
-; CHECK-O0-NEXT:    movq 8(%rdi), %rdx
-; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    jmp .LBB16_1
-; CHECK-O0-NEXT:  .LBB16_1: # %atomicrmw.start
-; CHECK-O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-O0-NEXT:    lock cmpxchg16b (%rsi)
-; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    jne .LBB16_1
-; CHECK-O0-NEXT:    jmp .LBB16_2
-; CHECK-O0-NEXT:  .LBB16_2: # %atomicrmw.end
-; CHECK-O0-NEXT:    popq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-NEXT:    vmovq %rsi, %xmm0
+; CHECK-O0-NEXT:    vmovq %rdx, %xmm1
+; CHECK-O0-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-NEXT:    vmovdqa %xmm0, (%rdi)
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-LABEL: store_i128:
 ; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O3-NEXT:    movq %rdx, %rcx
-; CHECK-O3-NEXT:    movq %rsi, %rbx
-; CHECK-O3-NEXT:    movq (%rdi), %rax
-; CHECK-O3-NEXT:    movq 8(%rdi), %rdx
-; CHECK-O3-NEXT:    .p2align 4, 0x90
-; CHECK-O3-NEXT:  .LBB16_1: # %atomicrmw.start
-; CHECK-O3-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT:    jne .LBB16_1
-; CHECK-O3-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-O3-NEXT:    popq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-NEXT:    vmovq %rdx, %xmm0
+; CHECK-O3-NEXT:    vmovq %rsi, %xmm1
+; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-O3-NEXT:    vmovdqa %xmm0, (%rdi)
 ; CHECK-O3-NEXT:    retq
   store atomic i128 %v, ptr %ptr unordered, align 16
   ret void
diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll
index d5600b54a169d2..76c3b2c5f1bb13 100644
--- a/llvm/test/CodeGen/X86/atomic128.ll
+++ b/llvm/test/CodeGen/X86/atomic128.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s --check-prefixes=CHECK,CHECK-NOAVX
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16,avx | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
 ; RUN: llc < %s -mtriple=i386-linux-gnu -verify-machineinstrs -mattr=cx16 | FileCheck %s -check-prefixes=CHECK32
 ; RUN: llc < %s -mtriple=i386-linux-gnu -verify-machineinstrs -mattr=-cx16 | FileCheck %s -check-prefixes=CHECK32
 
@@ -83,21 +84,32 @@ define i128 @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
 @cmpxchg16b_global = external dso_local global { i128, i128 }, align 16
 
 ;; Make sure we retain the offset of the global variable.
-define void @cmpxchg16b_global_with_offset() nounwind {
-; CHECK-LABEL: cmpxchg16b_global_with_offset:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    lock cmpxchg16b _cmpxchg16b_global+16(%rip)
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    retq
+define i128 @load_global_with_offset() nounwind {
+; CHECK-NOAVX-LABEL: load_global_with_offset:
+; CHECK-NOAVX:       ## %bb.0: ## %entry
+; CHECK-NOAVX-NEXT:    pushq %rbx
+; CHECK-NOAVX-NEXT:    xorl %eax, %eax
+; CHECK-NOAVX-NEXT:    xorl %edx, %edx
+; CHECK-NOAVX-NEXT:    xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT:    xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT:    lock cmpxchg16b _cmpxchg16b_global+16(%rip)
+; CHECK-NOAVX-NEXT:    popq %rbx
+; CHECK-NOAVX-NEXT:    retq
 ;
-; CHECK32-LABEL: cmpxchg16b_global_with_offset:
+; CHECK-AVX-LABEL: load_global_with_offset:
+; CHECK-AVX:       ## %bb.0: ## %entry
+; CHECK-AVX-NEXT:    vmovdqa _cmpxchg16b_global+16(%rip), %xmm0
+; CHECK-AVX-NEXT:    vmovq %xmm0, %rax
+; CHECK-AVX-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT:    retq
+;
+; CHECK32-LABEL: load_global_with_offset:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    subl $36, %esp
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    subl $20, %esp
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    subl $8, %esp
 ; CHECK32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    pushl $0
 ; CHECK32-NEXT:    pushl $0
@@ -110,11 +122,23 @@ define void @cmpxchg16b_global_with_offset() nounwind {
 ; CHECK32-NEXT:    pushl $cmpxchg16b_global+16
 ; CHECK32-NEXT:    pushl %eax
 ; CHECK32-NEXT:    calll __sync_val_compare_and_swap_16
-; CHECK32-NEXT:    addl $72, %esp
-; CHECK32-NEXT:    retl
+; CHECK32-NEXT:    addl $44, %esp
+; CHECK32-NEXT:    movl (%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl %edi, 8(%esi)
+; CHECK32-NEXT:    movl %edx, 12(%esi)
+; CHECK32-NEXT:    movl %eax, (%esi)
+; CHECK32-NEXT:    movl %ecx, 4(%esi)
+; CHECK32-NEXT:    movl %esi, %eax
+; CHECK32-NEXT:    addl $20, %esp
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    retl $4
 entry:
   %0 = load atomic i128, ptr getelementptr inbounds ({i128, i128}, ptr @cmpxchg16b_global, i64 0, i32 1) acquire, align 16
-  ret void
+  ret i128 %0
 }
 
 define void @fetch_and_nand(ptr %p, i128 %bits) {
@@ -676,18 +700,25 @@ define void @fetch_and_umax(ptr %p, i128 %bits) {
 }
 
 define i128 @atomic_load_seq_cst(ptr %p) {
-; CHECK-LABEL: atomic_load_seq_cst:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    retq
+; CHECK-NOAVX-LABEL: atomic_load_seq_cst:
+; CHECK-NOAVX:       ## %bb.0:
+; CHECK-NOAVX-NEXT:    pushq %rbx
+; CHECK-NOAVX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT:    xorl %eax, %eax
+; CHECK-NOAVX-NEXT:    xorl %edx, %edx
+; CHECK-NOAVX-NEXT:    xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT:    xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT:    popq %rbx
+; CHECK-NOAVX-NEXT:    retq
+;
+; CHECK-AVX-LABEL: atomic_load_seq_cst:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-AVX-NEXT:    vmovq %xmm0, %rax
+; CHECK-AVX-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK32-LABEL: atomic_load_seq_cst:
 ; CHECK32:       # %bb.0:
@@ -748,18 +779,25 @@ define i128 @atomic_load_seq_cst(ptr %p) {
 }
 
 define i128 @atomic_load_relaxed(ptr %p) {
-; CHECK-LABEL: atomic_load_relaxed:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    retq
+; CHECK-NOAVX-LABEL: atomic_load_relaxed:
+; CHECK-NOAVX:       ## %bb.0:
+; CHECK-NOAVX-NEXT:    pushq %rbx
+; CHECK-NOAVX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT:    xorl %eax, %eax
+; CHECK-NOAVX-NEXT:    xorl %edx, %edx
+; CHECK-NOAVX-NEXT:    xorl %ecx, %ecx
+; CHECK-NOAVX-NEXT:    xorl %ebx, %ebx
+; CHECK-NOAVX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT:    popq %rbx
+; CHECK-NOAVX-NEXT:    retq
+;
+; CHECK-AVX-LABEL: atomic_load_relaxed:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-AVX-NEXT:    vmovq %xmm0, %rax
+; CHECK-AVX-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK32-LABEL: atomic_load_relaxed:
 ; CHECK32:       # %bb.0:
@@ -820,23 +858,32 @@
 }
 
 define void @atomic_store_seq_cst(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_seq_cst:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    movq %rsi, %rbx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq 8(%rdi), %rdx
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB12_1: ## %atomicrmw.start
-; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-NEXT:    jne LBB12_1
-; CHECK-NEXT:  ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    retq
+; CHECK-NOAVX-LABEL: atomic_store_seq_cst:
+; CHECK-NOAVX:       ## %bb.0:
+; CHECK-NOAVX-NEXT:    pushq %rbx
+; CHECK-NOAVX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT:    movq %rdx, %rcx
+; CHECK-NOAVX-NEXT:    movq %rsi, %rbx
+; CHECK-NOAVX-NEXT:    movq (%rdi), %rax
+; CHECK-NOAVX-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT:    .p2align 4, 0x90
+; CHECK-NOAVX-NEXT:  LBB12_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT:    jne LBB12_1
+; CHECK-NOAVX-NEXT:  ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT:    popq %rbx
+; CHECK-NOAVX-NEXT:    retq
+;
+; CHECK-AVX-LABEL: atomic_store_seq_cst:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT:    vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK32-LABEL: atomic_store_seq_cst:
 ; CHECK32:       # %bb.0:
@@ -865,23 +912,31 @@
 }
 
 define void @atomic_store_release(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_release:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    movq %rsi, %rbx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq 8(%rdi), %rdx
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB13_1: ## %atomicrmw.start
-; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-NEXT:    jne LBB13_1
-; CHECK-NEXT:  ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    retq
+; CHECK-NOAVX-LABEL: atomic_store_release:
+; CHECK-NOAVX:       ## %bb.0:
+; CHECK-NOAVX-NEXT:    pushq %rbx
+; CHECK-NOAVX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT:    movq %rdx, %rcx
+; CHECK-NOAVX-NEXT:    movq %rsi, %rbx
+; CHECK-NOAVX-NEXT:    movq (%rdi), %rax
+; CHECK-NOAVX-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT:    .p2align 4, 0x90
+; CHECK-NOAVX-NEXT:  LBB13_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT:    jne LBB13_1
+; CHECK-NOAVX-NEXT:  ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT:    popq %rbx
+; CHECK-NOAVX-NEXT:    retq
+;
+; CHECK-AVX-LABEL: atomic_store_release:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT:    vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK32-LABEL: atomic_store_release:
 ; CHECK32:       # %bb.0:
@@ -910,23 +965,31 @@
 }
 
 define void @atomic_store_relaxed(ptr %p, i128 %in) {
-; CHECK-LABEL: atomic_store_relaxed:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    movq %rsi, %rbx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq 8(%rdi), %rdx
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB14_1: ## %atomicrmw.start
-; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-NEXT:    jne LBB14_1
-; CHECK-NEXT:  ## %bb.2: ## %atomicrmw.end
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    retq
+; CHECK-NOAVX-LABEL: atomic_store_relaxed:
+; CHECK-NOAVX:       ## %bb.0:
+; CHECK-NOAVX-NEXT:    pushq %rbx
+; CHECK-NOAVX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOAVX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NOAVX-NEXT:    movq %rdx, %rcx
+; CHECK-NOAVX-NEXT:    movq %rsi, %rbx
+; CHECK-NOAVX-NEXT:    movq (%rdi), %rax
+; CHECK-NOAVX-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NOAVX-NEXT:    .p2align 4, 0x90
+; CHECK-NOAVX-NEXT:  LBB14_1: ## %atomicrmw.start
+; CHECK-NOAVX-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NOAVX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NOAVX-NEXT:    jne LBB14_1
+; CHECK-NOAVX-NEXT:  ## %bb.2: ## %atomicrmw.end
+; CHECK-NOAVX-NEXT:    popq %rbx
+; CHECK-NOAVX-NEXT:    retq
+;
+; CHECK-AVX-LABEL: atomic_store_relaxed:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vmovq %rdx, %xmm0
+; CHECK-AVX-NEXT:    vmovq %rsi, %xmm1
+; CHECK-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK32-LABEL: atomic_store_relaxed:
 ; CHECK32:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll b/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
index 57594870a54a85..3fb561d00f97d1 100644
--- a/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ b/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -110,11 +110,9 @@ define i128 @cmpxchg_use_eflags_and_val(ptr %addr, i128 %offset) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %rbx, -16
 ; CHECK-NEXT:    movq %rdx, %r8
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-NEXT:    vmovq %xmm0, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB4_1: # %loop
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1

>From db3abe6b07517fbafc480a99f72fa62e9e4c9234 Mon Sep 17 00:00:00 2001
From: James Y Knight <jykni...@google.com>
Date: Mon, 4 Dec 2023 12:30:47 -0500
Subject: [PATCH 2/3] Revert change to LegalizeIntegerTypes.cpp

---
 .../SelectionDAG/LegalizeIntegerTypes.cpp | 28 +++---
 llvm/test/CodeGen/X86/atomic-unordered.ll | 94 +++++++++++++++----
 2 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 5b496feee7a8f4..54698edce7d6f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3831,14 +3831,17 @@ void DAGTypeLegalizer::ExpandIntRes_XROUND_XRINT(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, SDValue &Lo,
                                          SDValue &Hi) {
   if (N->isAtomic()) {
+    // It's typical to have larger CAS than atomic load instructions.
     SDLoc dl(N);
     EVT VT = N->getMemoryVT();
-    // We may support larger values in atomic_load than in a normal load
-    // (without splitting), so switch over if needed.
-    SDValue New = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, N->getOperand(0),
-                                N->getOperand(1), N->getMemOperand());
-    ReplaceValueWith(SDValue(N, 0), New.getValue(0));
-    ReplaceValueWith(SDValue(N, 1), New.getValue(1));
+    SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
+    SDValue Zero = DAG.getConstant(0, dl, VT);
+    SDValue Swap = DAG.getAtomicCmpSwap(
+        ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
+        VT, VTs, N->getOperand(0),
+        N->getOperand(1), Zero, Zero, N->getMemOperand());
+    ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
+    ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
     return;
   }
 
@@ -5396,13 +5399,14 @@ SDValue DAGTypeLegalizer::ExpandIntOp_XINT_TO_FP(SDNode *N) {
 
 SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
   if (N->isAtomic()) {
-    // We may support larger values in atomic_store than in a normal store
-    // (without splitting), so switch over if needed.
+    // It's typical to have larger CAS than atomic store instructions.
     SDLoc dl(N);
-    SDValue New =
-        DAG.getAtomic(ISD::ATOMIC_STORE, dl, N->getMemoryVT(), N->getOperand(0),
-                      N->getOperand(1), N->getOperand(2), N->getMemOperand());
-    return New.getValue(0);
+    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+                                 N->getMemoryVT(),
+                                 N->getOperand(0), N->getOperand(2),
+                                 N->getOperand(1),
+                                 N->getMemOperand());
+    return Swap.getValue(1);
   }
   if (ISD::isNormalStore(N))
     return ExpandOp_NormalStore(N, OpNo);
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 91e427189de477..f93cbe6aa9a91c 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -230,32 +230,86 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
 }
 
 define i128 @load_i128(ptr %ptr) {
-; CHECK-LABEL: load_i128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
-; CHECK-NEXT:    vmovq %xmm0, %rax
-; CHECK-NEXT:    vpextrq $1, %xmm0, %rdx
-; CHECK-NEXT:    retq
+; CHECK-O0-CUR-LABEL: load_i128:
+; CHECK-O0-CUR:       # %bb.0:
+; CHECK-O0-CUR-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-O0-CUR-NEXT:    vmovq %xmm0, %rax
+; CHECK-O0-CUR-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-O0-CUR-NEXT:    retq
+;
+; CHECK-O3-CUR-LABEL: load_i128:
+; CHECK-O3-CUR:       # %bb.0:
+; CHECK-O3-CUR-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-O3-CUR-NEXT:    vmovq %xmm0, %rax
+; CHECK-O3-CUR-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-O3-CUR-NEXT:    retq
+;
+; CHECK-O0-EX-LABEL: load_i128:
+; CHECK-O0-EX:       # %bb.0:
+; CHECK-O0-EX-NEXT:    pushq %rbx
+; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-O0-EX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-O0-EX-NEXT:    xorl %eax, %eax
+; CHECK-O0-EX-NEXT:    movl %eax, %ebx
+; CHECK-O0-EX-NEXT:    movq %rbx, %rax
+; CHECK-O0-EX-NEXT:    movq %rbx, %rdx
+; CHECK-O0-EX-NEXT:    movq %rbx, %rcx
+; CHECK-O0-EX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-O0-EX-NEXT:    popq %rbx
+; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-EX-NEXT:    retq
+;
+; CHECK-O3-EX-LABEL: load_i128:
+; CHECK-O3-EX:       # %bb.0:
+; CHECK-O3-EX-NEXT:    pushq %rbx
+; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-O3-EX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-O3-EX-NEXT:    xorl %eax, %eax
+; CHECK-O3-EX-NEXT:    xorl %edx, %edx
+; CHECK-O3-EX-NEXT:    xorl %ecx, %ecx
+; CHECK-O3-EX-NEXT:    xorl %ebx, %ebx
+; CHECK-O3-EX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-O3-EX-NEXT:    popq %rbx
+; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-EX-NEXT:    retq
   %v = load atomic i128, ptr %ptr unordered, align 16
   ret i128 %v
 }
 
 define void @store_i128(ptr %ptr, i128 %v) {
-; CHECK-O0-LABEL: store_i128:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    vmovq %rsi, %xmm0
-; CHECK-O0-NEXT:    vmovq %rdx, %xmm1
-; CHECK-O0-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-O0-NEXT:    vmovdqa %xmm0, (%rdi)
-; CHECK-O0-NEXT:    retq
+; CHECK-O0-CUR-LABEL: store_i128:
+; CHECK-O0-CUR:       # %bb.0:
+; CHECK-O0-CUR-NEXT:    vmovq %rsi, %xmm0
+; CHECK-O0-CUR-NEXT:    vmovq %rdx, %xmm1
+; CHECK-O0-CUR-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-CUR-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-O0-CUR-NEXT:    retq
 ;
-; CHECK-O3-LABEL: store_i128:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    vmovq %rdx, %xmm0
-; CHECK-O3-NEXT:    vmovq %rsi, %xmm1
-; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-O3-NEXT:    vmovdqa %xmm0, (%rdi)
-; CHECK-O3-NEXT:    retq
+; CHECK-O3-CUR-LABEL: store_i128:
+; CHECK-O3-CUR:       # %bb.0:
+; CHECK-O3-CUR-NEXT:    vmovq %rdx, %xmm0
+; CHECK-O3-CUR-NEXT:    vmovq %rsi, %xmm1
+; CHECK-O3-CUR-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-O3-CUR-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-O3-CUR-NEXT:    retq
+;
+; CHECK-O0-EX-LABEL: store_i128:
+; CHECK-O0-EX:       # %bb.0:
+; CHECK-O0-EX-NEXT:    pushq %rax
+; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-O0-EX-NEXT:    callq __sync_lock_test_and_set_16@PLT
+; CHECK-O0-EX-NEXT:    popq %rax
+; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-EX-NEXT:    retq
+;
+; CHECK-O3-EX-LABEL: store_i128:
+; CHECK-O3-EX:       # %bb.0:
+; CHECK-O3-EX-NEXT:    pushq %rax
+; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-O3-EX-NEXT:    callq __sync_lock_test_and_set_16@PLT
+; CHECK-O3-EX-NEXT:    popq %rax
+; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-EX-NEXT:    retq
   store atomic i128 %v, ptr %ptr unordered, align 16
   ret void
 }

>From 0632fc2072806b5dce6548c3b0beb3e4d8ae6cfa Mon Sep 17 00:00:00 2001
From: James Y Knight <jykni...@google.com>
Date: Sat, 16 Dec 2023 10:36:28 -0500
Subject: [PATCH 3/3] Address review comments.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp   | 18 ++---
 llvm/test/CodeGen/X86/atomic-unordered.ll | 96 +++++------------------
 2 files changed, 29 insertions(+), 85 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4e72e5a25a9389..6e8ccab20f12b0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30136,20 +30136,20 @@ X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 }
 
 // Note: this turns large loads into lock cmpxchg8b/16b.
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   Type *MemType = LI->getType();
 
-  // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
-  // can use movq to do the load. If we have X87 we can load into an 80-bit
-  // X87 register and store it to a stack temporary.
   if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
       !Subtarget.useSoftFloat()) {
+    // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
+    // can use movq to do the load. If we have X87 we can load into an 80-bit
+    // X87 register and store it to a stack temporary.
     if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
         (Subtarget.hasSSE1() || Subtarget.hasX87()))
      return AtomicExpansionKind::None;
 
+    // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
     if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
         Subtarget.hasAVX())
       return AtomicExpansionKind::None;
@@ -31298,12 +31298,10 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
     SDValue Chain;
     // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
    // vector store.
-    if (VT == MVT::i128) {
-      if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
-        SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
-        Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
-                             Node->getMemOperand());
-      }
+    if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
+      SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
+      Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
+                           Node->getMemOperand());
     }
 
     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index bc8f77a654c045..3fb994cdb751a3 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -228,86 +228,32 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
 }
 
 define i128 @load_i128(ptr %ptr) {
-; CHECK-O0-CUR-LABEL: load_i128:
-; CHECK-O0-CUR:       # %bb.0:
-; CHECK-O0-CUR-NEXT:    vmovdqa (%rdi), %xmm0
-; CHECK-O0-CUR-NEXT:    vmovq %xmm0, %rax
-; CHECK-O0-CUR-NEXT:    vpextrq $1, %xmm0, %rdx
-; CHECK-O0-CUR-NEXT:    retq
-;
-; CHECK-O3-CUR-LABEL: load_i128:
-; CHECK-O3-CUR:       # %bb.0:
-; CHECK-O3-CUR-NEXT:    vmovdqa (%rdi), %xmm0
-; CHECK-O3-CUR-NEXT:    vmovq %xmm0, %rax
-; CHECK-O3-CUR-NEXT:    vpextrq $1, %xmm0, %rdx
-; CHECK-O3-CUR-NEXT:    retq
-;
-; CHECK-O0-EX-LABEL: load_i128:
-; CHECK-O0-EX:       # %bb.0:
-; CHECK-O0-EX-NEXT:    pushq %rbx
-; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-EX-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O0-EX-NEXT:    xorl %eax, %eax
-; CHECK-O0-EX-NEXT:    movl %eax, %ebx
-; CHECK-O0-EX-NEXT:    movq %rbx, %rax
-; CHECK-O0-EX-NEXT:    movq %rbx, %rdx
-; CHECK-O0-EX-NEXT:    movq %rbx, %rcx
-; CHECK-O0-EX-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O0-EX-NEXT:    popq %rbx
-; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O0-EX-NEXT:    retq
-;
-; CHECK-O3-EX-LABEL: load_i128:
-; CHECK-O3-EX:       # %bb.0:
-; CHECK-O3-EX-NEXT:    pushq %rbx
-; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-EX-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O3-EX-NEXT:    xorl %eax, %eax
-; CHECK-O3-EX-NEXT:    xorl %edx, %edx
-; CHECK-O3-EX-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-EX-NEXT:    xorl %ebx, %ebx
-; CHECK-O3-EX-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O3-EX-NEXT:    popq %rbx
-; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O3-EX-NEXT:    retq
+; CHECK-LABEL: load_i128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-NEXT:    vmovq %xmm0, %rax
+; CHECK-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-NEXT:    retq
   %v = load atomic i128, ptr %ptr unordered, align 16
   ret i128 %v
 }
 
 define void @store_i128(ptr %ptr, i128 %v) {
-; CHECK-O0-CUR-LABEL: store_i128:
-; CHECK-O0-CUR:       # %bb.0:
-; CHECK-O0-CUR-NEXT:    vmovq %rsi, %xmm0
-; CHECK-O0-CUR-NEXT:    vmovq %rdx, %xmm1
-; CHECK-O0-CUR-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-O0-CUR-NEXT:    vmovdqa %xmm0, (%rdi)
-; CHECK-O0-CUR-NEXT:    retq
-;
-; CHECK-O3-CUR-LABEL: store_i128:
-; CHECK-O3-CUR:       # %bb.0:
-; CHECK-O3-CUR-NEXT:    vmovq %rdx, %xmm0
-; CHECK-O3-CUR-NEXT:    vmovq %rsi, %xmm1
-; CHECK-O3-CUR-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-O3-CUR-NEXT:    vmovdqa %xmm0, (%rdi)
-; CHECK-O3-CUR-NEXT:    retq
-;
-; CHECK-O0-EX-LABEL: store_i128:
-; CHECK-O0-EX:       # %bb.0:
-; CHECK-O0-EX-NEXT:    pushq %rax
-; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-EX-NEXT:    callq __sync_lock_test_and_set_16@PLT
-; CHECK-O0-EX-NEXT:    popq %rax
-; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O0-EX-NEXT:    retq
-;
-; CHECK-O3-EX-LABEL: store_i128:
-; CHECK-O3-EX:       # %bb.0:
-; CHECK-O3-EX-NEXT:    pushq %rax
-; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-EX-NEXT:    callq __sync_lock_test_and_set_16@PLT
-; CHECK-O3-EX-NEXT:    popq %rax
-; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O3-EX-NEXT:    retq
+; CHECK-O0-LABEL: store_i128:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    vmovq %rsi, %xmm0
+; CHECK-O0-NEXT:    vmovq %rdx, %xmm1
+; CHECK-O0-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-O3-LABEL: store_i128:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    vmovq %rdx, %xmm0
+; CHECK-O3-NEXT:    vmovq %rsi, %xmm1
+; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-O3-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-O3-NEXT:    retq
   store atomic i128 %v, ptr %ptr unordered, align 16
   ret void
 }
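For reference, the new lowering can be observed with a small standalone
test along these lines (a sketch; the triple and -mattr flags mirror
the RUN lines in the updated atomic128.ll above):

  $ cat t.ll
  define i128 @load(ptr %p) {
    %v = load atomic i128, ptr %p seq_cst, align 16
    ret i128 %v
  }
  $ llc < t.ll -mtriple=x86_64-apple-macosx10.9 -mattr=cx16,avx

With avx in -mattr this emits a plain vmovdqa; with cx16 alone it still
falls back to the lock cmpxchg16b sequence.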