Author: sampo Date: Sun Feb 10 22:19:36 2008 New Revision: 46949 URL: http://llvm.org/viewvc/llvm-project?rev=46949&view=rev Log: Enable SSE4 codegen and pattern matching. Add some notes to the README.
Modified: llvm/trunk/lib/Target/TargetSelectionDAG.td llvm/trunk/lib/Target/X86/README-SSE.txt llvm/trunk/lib/Target/X86/X86ISelLowering.cpp llvm/trunk/lib/Target/X86/X86ISelLowering.h llvm/trunk/lib/Target/X86/X86InstrSSE.td Modified: llvm/trunk/lib/Target/TargetSelectionDAG.td URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/TargetSelectionDAG.td?rev=46949&r1=46948&r2=46949&view=diff ============================================================================== --- llvm/trunk/lib/Target/TargetSelectionDAG.td (original) +++ llvm/trunk/lib/Target/TargetSelectionDAG.td Sun Feb 10 22:19:36 2008 @@ -181,8 +181,8 @@ def SDTVecExtract : SDTypeProfile<1, 2, [ // vector extract SDTCisEltOfVec<0, 1>, SDTCisPtrTy<2> ]>; -def SDTVecInsert : SDTypeProfile<1, 2, [ // vector insert - SDTCisEltOfVec<1, 0>, SDTCisPtrTy<2> +def SDTVecInsert : SDTypeProfile<1, 3, [ // vector insert + SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3> ]>; class SDCallSeqStart<list<SDTypeConstraint> constraints> : Modified: llvm/trunk/lib/Target/X86/README-SSE.txt URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/README-SSE.txt?rev=46949&r1=46948&r2=46949&view=diff ============================================================================== --- llvm/trunk/lib/Target/X86/README-SSE.txt (original) +++ llvm/trunk/lib/Target/X86/README-SSE.txt Sun Feb 10 22:19:36 2008 @@ -761,3 +761,23 @@ instead of in target independent code. //===---------------------------------------------------------------------===// + +Non-SSE4 insert into 16 x i8 is atrociously bad. + +//===---------------------------------------------------------------------===// + +<2 x i64> extract is substantially worse than <2 x f64>, even if the destination +is memory. + +//===---------------------------------------------------------------------===// + +SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext +sitting between the truncate and the extract. + +//===---------------------------------------------------------------------===// + +INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert +any number of 0.0 simultaneously. Currently we only use it for simple +insertions. + +See comments in LowerINSERT_VECTOR_ELT_SSE4. Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=46949&r1=46948&r2=46949&view=diff ============================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original) +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Feb 10 22:19:36 2008 @@ -678,6 +678,33 @@ setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); } + + if (Subtarget->hasSSE41()) { + // FIXME: Do we need to handle scalar-to-vector here? + setOperationAction(ISD::MUL, MVT::v4i32, Legal); + + // i8 and i16 vectors are custom , because the source register and source + // source memory operand types are not the same width. f32 vectors are + // custom since the immediate controlling the insert encodes additional + // information. + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Legal); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + } + } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -3655,10 +3682,34 @@ } SDOperand +X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op, + SelectionDAG &DAG) { + MVT::ValueType VT = Op.getValueType(); + if (MVT::getSizeInBits(VT) == 8) { + SDOperand Extract = DAG.getNode(X86ISD::PEXTRB, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDOperand Assert = DAG.getNode(ISD::AssertZext, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, VT, Assert); + } else if (MVT::getSizeInBits(VT) == 16) { + SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDOperand Assert = DAG.getNode(ISD::AssertZext, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, VT, Assert); + } + return SDOperand(); +} + + +SDOperand X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { if (!isa<ConstantSDNode>(Op.getOperand(1))) return SDOperand(); + if (Subtarget->hasSSE41()) + return LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); + MVT::ValueType VT = Op.getValueType(); // TODO: handle v16i8. if (MVT::getSizeInBits(VT) == 16) { @@ -3699,6 +3750,9 @@ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec, DAG.getIntPtrConstant(0)); } else if (MVT::getSizeInBits(VT) == 64) { + // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b + // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught + // to match extract_elt for f64. unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue(); if (Idx == 0) return Op; @@ -3724,9 +3778,47 @@ } SDOperand +X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG){ + MVT::ValueType VT = Op.getValueType(); + MVT::ValueType EVT = MVT::getVectorElementType(VT); + + SDOperand N0 = Op.getOperand(0); + SDOperand N1 = Op.getOperand(1); + SDOperand N2 = Op.getOperand(2); + + if ((MVT::getSizeInBits(EVT) == 8) || (MVT::getSizeInBits(EVT) == 16)) { + unsigned Opc = (MVT::getSizeInBits(EVT) == 8) ? X86ISD::PINSRB + : X86ISD::PINSRW; + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue()); + return DAG.getNode(Opc, VT, N0, N1, N2); + } else if (EVT == MVT::f32) { + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into these + // bits. For example (insert (extract, 3), 2) could be matched by putting + // the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // combine either bitwise AND or insert of float 0.0 to set these bits. + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue() << 4); + return DAG.getNode(X86ISD::INSERTPS, VT, N0, N1, N2); + } + return SDOperand(); +} + +SDOperand X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { MVT::ValueType VT = Op.getValueType(); MVT::ValueType EVT = MVT::getVectorElementType(VT); + + if (Subtarget->hasSSE41()) + return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); + if (EVT == MVT::i8) return SDOperand(); @@ -5273,7 +5365,10 @@ case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::S2VEC: return "X86ISD::S2VEC"; + case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; + case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; + case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=46949&r1=46948&r2=46949&view=diff ============================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original) +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Sun Feb 10 22:19:36 2008 @@ -170,10 +170,22 @@ /// have to match the operand type. S2VEC, + /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRW. PEXTRW, + /// INSERTPS - Insert any element of a 4 x float vector into any element + /// of a destination 4 x floatvector. + INSERTPS, + + /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRW. PINSRW, @@ -493,7 +505,9 @@ SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG); SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG); SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG); SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG); SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG); SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG); SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG); Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=46949&r1=46948&r2=46949&view=diff ============================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original) +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Sun Feb 10 22:19:36 2008 @@ -35,8 +35,19 @@ def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; def X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>; -def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>; -def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>; +def X86pextrb : SDNode<"X86ISD::PEXTRB", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pinsrb : SDNode<"X86ISD::PINSRB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86pinsrw : SDNode<"X86ISD::PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86insrtps : SDNode<"X86ISD::INSERTPS", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, + SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>; //===----------------------------------------------------------------------===// // SSE 'Special' Instructions @@ -2087,23 +2098,21 @@ (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), - (iPTR imm:$src2)))]>; + imm:$src2))]>; let isTwoAddress = 1 in { def PINSRWrri : PDIi8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, - (v8i16 (X86pinsrw (v8i16 VR128:$src1), - GR32:$src2, (iPTR imm:$src3))))]>; + (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>; def PINSRWrmi : PDIi8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, i32i8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set VR128:$dst, - (v8i16 (X86pinsrw (v8i16 VR128:$src1), - (i32 (anyext (loadi16 addr:$src2))), - (iPTR imm:$src3))))]>; + [(set VR128:$dst, + (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), + imm:$src3))]>; } // Mask creation @@ -3255,7 +3264,7 @@ /// SS41I_binop_rmi_int - SSE 4.1 binary operator with immediate -let isTwoAddress = 1 in { +let Uses = [XMM0], isTwoAddress = 1 in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -3328,26 +3337,44 @@ defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovsxbq", int_x86_sse41_pmovzxbq>; -/// SS41I_binop_ext8 - SSE 4.1 binary operator with immediate -multiclass SS41I_binop_ext8<bits<8> opc, string OpcodeStr> { +/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem +multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { def rr : SS4AI<opc, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set GR32:$dst, (zext - (extractelt (v16i8 VR128:$src1), imm:$src2)))]>, OpSize; + [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, + OpSize; def mr : SS4AI<opc, MRMDestMem, (outs), (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(store (extractelt (v16i8 VR128:$src1), imm:$src2), - addr:$dst)]>, OpSize; + []>, OpSize; +// FIXME: +// There's an AssertZext in the way of writing the store pattern +// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) +} + +defm PEXTRB : SS41I_extract8<0x14, "pextrb">; + + +/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination +multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { + def mr : SS4AI<opc, MRMDestMem, (outs), + (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; +// FIXME: +// There's an AssertZext in the way of writing the store pattern +// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -defm PEXTRB : SS41I_binop_ext8<0x14, "pextrb">; +defm PEXTRW : SS41I_extract16<0x15, "pextrw">; + -/// SS41I_binop_ext32 - SSE 4.1 binary operator with immediate -multiclass SS41I_binop_ext32<bits<8> opc, string OpcodeStr> { +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { def rr : SS4AI<opc, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, @@ -3362,10 +3389,11 @@ addr:$dst)]>, OpSize; } -defm PEXTRD : SS41I_binop_ext32<0x16, "pextrd">; +defm PEXTRD : SS41I_extract32<0x16, "pextrd">; -/// SS41I_binop_extf32 - SSE 4.1 binary operator with immediate -multiclass SS41I_binop_extf32<bits<8> opc, string OpcodeStr> { + +/// SS41I_extractf32 - SSE 4.1 extract 32 bits to fp reg or memory destination +multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { def rr : SS4AI<opc, MRMSrcReg, (outs FR32:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, @@ -3380,5 +3408,65 @@ addr:$dst)]>, OpSize; } -defm EXTRACTPS : SS41I_binop_extf32<0x17, "extractps">; +defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; + +let isTwoAddress = 1 in { + multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> { + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize; + def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), + imm:$src3))]>, OpSize; + } +} + +defm PINSRB : SS41I_insert8<0x20, "pinsrb">; + +let isTwoAddress = 1 in { + multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> { + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, + OpSize; + def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), + imm:$src3)))]>, OpSize; + } +} + +defm PINSRD : SS41I_insert32<0x22, "pinsrd">; + +let isTwoAddress = 1 in { + multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> { + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, FR32:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (X86insrtps VR128:$src1, FR32:$src2, imm:$src3))]>, OpSize; + def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (X86insrtps VR128:$src1, (loadf32 addr:$src2), + imm:$src3))]>, OpSize; + } +} +defm INSERTPS : SS41I_insertf32<0x31, "insertps">; _______________________________________________ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits