https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/122672
This avoids regressions in a future AMDGPU commit. Previously, a
build_vector (extract_vector_elt x), undef whose elements could be
accessed for free would be bloated into a shuffle of one element +
undef, which has much worse combine support than the extract.

Alternatively, we could check aggressivelyPreferBuildVectorSources, but
I'm not sure it's really different from isExtractVecEltCheap.

>From 8760ea5965a338568048293f67faa502e05bbfa7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Fri, 10 Jan 2025 21:13:09 +0700
Subject: [PATCH] DAG: Avoid forming shufflevector from a single extract_vector_elt

This avoids regressions in a future AMDGPU commit. Previously we
would have a build_vector (extract_vector_elt x), undef with free
access to the elements bloated into a shuffle of one element + undef,
which has much worse combine support than the extract.

Alternatively could check aggressivelyPreferBuildVectorSources, but
I'm not sure it's really different than isExtractVecEltCheap.

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 +++++++++++++++----
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    | 10 ++++----
 llvm/test/CodeGen/X86/sse41.ll                |  8 +++---
 3 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 712e52ee8fc921..49ad0b526602eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23799,6 +23799,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   SmallVector<SDValue, 8> VecIn;
   VecIn.push_back(SDValue());
 
+  // If we have a single extract_element with a constant index, track the index
+  // value.
+  unsigned OneConstExtractIndex = ~0u;
+
   for (unsigned i = 0; i != NumElems; ++i) {
     SDValue Op = N->getOperand(i);
 
@@ -23816,16 +23820,18 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
 
     // Not an undef or zero. If the input is something other than an
     // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
-    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        !isa<ConstantSDNode>(Op.getOperand(1)))
+    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       return SDValue();
-    SDValue ExtractedFromVec = Op.getOperand(0);
 
+    SDValue ExtractedFromVec = Op.getOperand(0);
     if (ExtractedFromVec.getValueType().isScalableVector())
       return SDValue();
+    auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!ExtractIdx)
+      return SDValue();
 
-    const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
-    if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+    if (ExtractIdx->getAsAPIntVal().uge(
+            ExtractedFromVec.getValueType().getVectorNumElements()))
       return SDValue();
 
     // All inputs must have the same element type as the output.
@@ -23833,6 +23839,8 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
         ExtractedFromVec.getValueType().getVectorElementType())
       return SDValue();
 
+    OneConstExtractIndex = ExtractIdx->getZExtValue();
+
     // Have we seen this input vector before?
     // The vectors are expected to be tiny (usually 1 or 2 elements), so using
     // a map back from SDValues to numbers isn't worth it.
@@ -23855,6 +23863,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   // VecIn accordingly.
   bool DidSplitVec = false;
   if (VecIn.size() == 2) {
+    // If we only found a single constant indexed extract_vector_elt feeding the
+    // build_vector, do not produce a more complicated shuffle if the extract is
+    // cheap.
+    if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
+        TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+      return SDValue();
+
     unsigned MaxIndex = 0;
     unsigned NearestPow2 = 0;
     SDValue Vec = VecIn.back();
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 7912d1cf8dc0d1..add8c0f75bf335 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
 ; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
 ; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d09..6dac18c927e114 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -2124,14 +2124,14 @@ define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
 ; AVX1-LABEL: build_vector_to_shuffle_1:
 ; AVX1:       ## %bb.0:
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT:    vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
+; AVX1-NEXT:    vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
 ; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 ; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 ;
 ; AVX512-LABEL: build_vector_to_shuffle_1:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT:    vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
+; AVX512-NEXT:    vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
 ; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 ; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %vecext = extractelement <4 x float> %A, i32 1
@@ -2152,14 +2152,14 @@ define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
 ; AVX1-LABEL: build_vector_to_shuffle_2:
 ; AVX1:       ## %bb.0:
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT:    vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
+; AVX1-NEXT:    vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
 ; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 ;
 ; AVX512-LABEL: build_vector_to_shuffle_2:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT:    vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
+; AVX512-NEXT:    vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
 ; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %vecext = extractelement <4 x float> %A, i32 1

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
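
For readers unfamiliar with the pattern, here is a minimal IR sketch of the
affected case (the function name is illustrative, not taken from the patch);
it mirrors the build_vector_to_shuffle_* tests in sse41.ll above, where a
build_vector's only interesting lane comes from a single constant-index
extract_vector_elt:

  define <4 x float> @single_extract_into_zero(<4 x float> %A) {
    ; Extract one lane at a constant index ...
    %elt = extractelement <4 x float> %A, i32 1
    ; ... and rebuild a vector where that is the only non-trivial element.
    %v = insertelement <4 x float> zeroinitializer, float %elt, i32 1
    ret <4 x float> %v
  }

Per the patch description, reduceBuildVecToShuffle previously turned this
build_vector into a shuffle of one element plus undef/zero; with the change,
targets where EXTRACT_VECTOR_ELT is legal or custom and TLI reports the
extract as cheap (isExtractVecEltCheap) keep the simpler build_vector form.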