[llvm-commits] CVS: llvm/docs/WritingAnLLVMBackend.html
Changes in directory llvm/docs: WritingAnLLVMBackend.html updated: 1.10 -> 1.11 --- Log message: Suggest looking at the SPARC backend. How weird is that?? :) --- Diffs of the changes: (+4 -3) WritingAnLLVMBackend.html |7 --- 1 files changed, 4 insertions(+), 3 deletions(-) Index: llvm/docs/WritingAnLLVMBackend.html diff -u llvm/docs/WritingAnLLVMBackend.html:1.10 llvm/docs/WritingAnLLVMBackend.html:1.11 --- llvm/docs/WritingAnLLVMBackend.html:1.10 Mon Mar 13 23:39:39 2006 +++ llvm/docs/WritingAnLLVMBackend.html Mon Apr 24 11:34:45 2006 @@ -61,8 +61,9 @@ -In general, you want to follow the format of X86 or PowerPC (in -lib/Target). +In general, you want to follow the format of SPARC, X86 or PowerPC (in +lib/Target). SPARC is the simplest backend, and is RISC, so if +you're working on a RISC target, it is a good one to start with. To create a static compiler (one that emits text assembly), you need to implement the following: @@ -252,7 +253,7 @@ <a href="http://misha.brukman.net">Misha Brukman</a> <a href="http://llvm.org">The LLVM Compiler Infrastructure</a> - Last modified: $Date: 2006/03/14 05:39:39 $ + Last modified: $Date: 2006/04/24 16:34:45 $ ___ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
[llvm-commits] CVS: llvm/lib/Target/X86/README.txt
Changes in directory llvm/lib/Target/X86: README.txt updated: 1.96 -> 1.97 --- Log message: Remove a completed entry. --- Diffs of the changes: (+0 -55) README.txt | 55 --- 1 files changed, 55 deletions(-) Index: llvm/lib/Target/X86/README.txt diff -u llvm/lib/Target/X86/README.txt:1.96 llvm/lib/Target/X86/README.txt:1.97 --- llvm/lib/Target/X86/README.txt:1.96 Sun Apr 23 14:47:09 2006 +++ llvm/lib/Target/X86/README.txt Mon Apr 24 12:38:16 2006 @@ -999,61 +999,6 @@ //===-===// -Use the 0's in the top part of movss from memory (and from other instructions -that generate them) to build vectors more efficiently. Consider: - -vector float test(float a) { - return (vector float){ 0.0, a, 0.0, 0.0}; -} - -We currently generate this as: - -_test: -sub %ESP, 28 -movss %XMM0, DWORD PTR [%ESP + 32] -movss DWORD PTR [%ESP + 4], %XMM0 -mov DWORD PTR [%ESP + 12], 0 -mov DWORD PTR [%ESP + 8], 0 -mov DWORD PTR [%ESP], 0 -movaps %XMM0, XMMWORD PTR [%ESP] -add %ESP, 28 -ret - -Something like this should be sufficient: - -_test: - movss %XMM0, DWORD PTR [%ESP + 4] - shufps %XMM0, %XMM0, 81 - ret - -... which takes advantage of the zero elements provided by movss. -Even xoring a register and shufps'ing IT would be better than the -above code. - -Likewise, for this: - -vector float test(float a, float b) { - return (vector float){ b, a, 0.0, 0.0}; -} - -_test: -pxor %XMM0, %XMM0 -movss %XMM1, %XMM0 -movss %XMM2, DWORD PTR [%ESP + 4] -unpcklps %XMM2, %XMM1 -movss %XMM0, DWORD PTR [%ESP + 8] -unpcklps %XMM0, %XMM1 -unpcklps %XMM0, %XMM2 -ret - -... where we do use pxor, it would be better to use the zero'd -elements that movss provides to turn this into 2 shufps's instead -of 3 unpcklps's. - -Another example: {0.0, 0.0, a, b } - -//===-===// - Consider: __m128 test(float a) { ___ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
[llvm-commits] CVS: llvm/lib/Target/X86/X86ISelLowering.cpp
Changes in directory llvm/lib/Target/X86: X86ISelLowering.cpp updated: 1.184 -> 1.185 --- Log message: A little bit more build_vector enhancement for v8i16 cases. --- Diffs of the changes: (+105 -42) X86ISelLowering.cpp | 147 +--- 1 files changed, 105 insertions(+), 42 deletions(-) Index: llvm/lib/Target/X86/X86ISelLowering.cpp diff -u llvm/lib/Target/X86/X86ISelLowering.cpp:1.184 llvm/lib/Target/X86/X86ISelLowering.cpp:1.185 --- llvm/lib/Target/X86/X86ISelLowering.cpp:1.184 Sun Apr 23 01:35:19 2006 +++ llvm/lib/Target/X86/X86ISelLowering.cpp Mon Apr 24 13:01:45 2006 @@ -2154,6 +2154,78 @@ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask); } +/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. +/// +static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG) { + if (NumNonZero > 8) +return SDOperand(); + + SDOperand V(0, 0); + bool First = true; + for (unsigned i = 0; i < 16; ++i) { +bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; +if (ThisIsNonZero && First) { + if (NumZero) +V = getZeroVector(MVT::v8i16, DAG); + else +V = DAG.getNode(ISD::UNDEF, MVT::v8i16); + First = false; +} + +if ((i & 1) != 0) { + SDOperand ThisElt(0, 0), LastElt(0, 0); + bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + if (LastIsNonZero) { +LastElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i-1)); + } + if (ThisIsNonZero) { +ThisElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i)); +ThisElt = DAG.getNode(ISD::SHL, MVT::i16, + ThisElt, DAG.getConstant(8, MVT::i8)); +if (LastIsNonZero) + ThisElt = DAG.getNode(ISD::OR, MVT::i16, ThisElt, LastElt); + } else +ThisElt = LastElt; + + if (ThisElt.Val) +V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, ThisElt, +DAG.getConstant(i/2, MVT::i32)); +} + } + + return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V); +} + +/// LowerBuildVectorv16i8 - Custom lower build_vector of v8i16. 
+/// +static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG) { + if (NumNonZero > 4) +return SDOperand(); + + SDOperand V(0, 0); + bool First = true; + for (unsigned i = 0; i < 8; ++i) { +bool isNonZero = (NonZeros & (1 << i)) != 0; +if (isNonZero) { + if (First) { +if (NumZero) + V = getZeroVector(MVT::v8i16, DAG); +else + V = DAG.getNode(ISD::UNDEF, MVT::v8i16); +First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, Op.getOperand(i), + DAG.getConstant(i, MVT::i32)); +} + } + + return V; +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) { @@ -3152,38 +3224,49 @@ return SDOperand(); } case ISD::BUILD_VECTOR: { +// All zero's are handled with pxor. +if (ISD::isBuildVectorAllZeros(Op.Val)) + return Op; + // All one's are handled with pcmpeqd. if (ISD::isBuildVectorAllOnes(Op.Val)) return Op; -unsigned NumElems = Op.getNumOperands(); -if (NumElems == 2) - return SDOperand(); - -unsigned Half = NumElems/2; MVT::ValueType VT = Op.getValueType(); MVT::ValueType EVT = MVT::getVectorBaseType(VT); +unsigned EVTBits = MVT::getSizeInBits(EVT); + +// Let legalizer expand 2-widde build_vector's. +if (EVTBits == 64) + return SDOperand(); + +unsigned NumElems = Op.getNumOperands(); unsigned NumZero = 0; +unsigned NumNonZero = 0; unsigned NonZeros = 0; std::set Values; for (unsigned i = 0; i < NumElems; ++i) { SDOperand Elt = Op.getOperand(i); - Values.insert(Elt); - if (isZeroNode(Elt)) -NumZero++; - else if (Elt.getOpcode() != ISD::UNDEF) -NonZeros |= (1 << i); + if (Elt.getOpcode() != ISD::UNDEF) { +Values.insert(Elt); +if (isZeroNode(Elt)) + NumZero++; +else { + NonZeros |= (1 << i); + NumNonZero++; +} + } } -unsigned NumNonZero = CountPopulation_32(NonZeros); if (NumNonZero == 0) - return Op; + // Must be a mix of zero and undef. Return a zero vector. 
+ return getZeroVector(VT, DAG); // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) return SDOperand(); -// If element VT is >= 32 bits, turn it into a number of shuffles. +// Special case for single non-zer
[llvm-commits] CVS: llvm/lib/Target/X86/X86InstrSSE.td
Changes in directory llvm/lib/Target/X86: X86InstrSSE.td updated: 1.110 -> 1.111 --- Log message: Some missing movlps, movhps, movlpd, and movhpd patterns. --- Diffs of the changes: (+14 -6) X86InstrSSE.td | 20 ++-- 1 files changed, 14 insertions(+), 6 deletions(-) Index: llvm/lib/Target/X86/X86InstrSSE.td diff -u llvm/lib/Target/X86/X86InstrSSE.td:1.110 llvm/lib/Target/X86/X86InstrSSE.td:1.111 --- llvm/lib/Target/X86/X86InstrSSE.td:1.110Thu Apr 20 20:05:10 2006 +++ llvm/lib/Target/X86/X86InstrSSE.td Mon Apr 24 16:58:20 2006 @@ -2462,18 +2462,26 @@ MOVHP_shuffle_mask)), (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; -def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, - MOVL_shuffle_mask)), - (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), MOVLP_shuffle_mask)), (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2), + MOVLP_shuffle_mask)), + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), + MOVHP_shuffle_mask)), + (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>; +def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2), + MOVLP_shuffle_mask)), + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + +// Setting the lowest element in the vector. 
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVL_shuffle_mask)), + (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2, MOVL_shuffle_mask)), (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; -def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2), - MOVHP_shuffle_mask)), - (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; } // 128-bit logical shifts ___ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
[llvm-commits] CVS: llvm/lib/Target/X86/X86ISelLowering.cpp
Changes in directory llvm/lib/Target/X86: X86ISelLowering.cpp updated: 1.185 -> 1.186 --- Log message: Special case handling two wide build_vector(0, x). --- Diffs of the changes: (+4 -4) X86ISelLowering.cpp |8 1 files changed, 4 insertions(+), 4 deletions(-) Index: llvm/lib/Target/X86/X86ISelLowering.cpp diff -u llvm/lib/Target/X86/X86ISelLowering.cpp:1.185 llvm/lib/Target/X86/X86ISelLowering.cpp:1.186 --- llvm/lib/Target/X86/X86ISelLowering.cpp:1.185 Mon Apr 24 13:01:45 2006 +++ llvm/lib/Target/X86/X86ISelLowering.cpp Mon Apr 24 17:58:52 2006 @@ -3236,10 +3236,6 @@ MVT::ValueType EVT = MVT::getVectorBaseType(VT); unsigned EVTBits = MVT::getSizeInBits(EVT); -// Let legalizer expand 2-wide build_vector's. -if (EVTBits == 64) - return SDOperand(); - unsigned NumElems = Op.getNumOperands(); unsigned NumZero = 0; unsigned NumNonZero = 0; @@ -3291,6 +3287,10 @@ } } +// Let legalizer expand 2-wide build_vector's. +if (EVTBits == 64) + return SDOperand(); + // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8) { SDOperand V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG); ___ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
[llvm-commits] CVS: llvm/test/Regression/CodeGen/X86/vec_set-7.ll
Changes in directory llvm/test/Regression/CodeGen/X86: vec_set-7.ll added (r1.1) --- Log message: Added a movq test case. --- Diffs of the changes: (+10 -0) vec_set-7.ll | 10 ++ 1 files changed, 10 insertions(+) Index: llvm/test/Regression/CodeGen/X86/vec_set-7.ll diff -c /dev/null llvm/test/Regression/CodeGen/X86/vec_set-7.ll:1.1 *** /dev/null Mon Apr 24 18:03:32 2006 --- llvm/test/Regression/CodeGen/X86/vec_set-7.ll Mon Apr 24 18:03:22 2006 *** *** 0 --- 1,10 + ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq | wc -l | grep 1 + + <2 x long> %test(<2 x long>* %p) { + %tmp = cast <2 x long>* %p to double* + %tmp = load double* %tmp + %tmp = insertelement <2 x double> undef, double %tmp, uint 0 + %tmp5 = insertelement <2 x double> %tmp, double 0.00e+00, uint 1 + %tmp = cast <2 x double> %tmp5 to <2 x long> + ret <2 x long> %tmp + } ___ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
[llvm-commits] CVS: llvm/lib/Target/X86/README.txt
Changes in directory llvm/lib/Target/X86: README.txt updated: 1.97 -> 1.98 --- Log message: Add a new entry. --- Diffs of the changes: (+32 -0) README.txt | 32 1 files changed, 32 insertions(+) Index: llvm/lib/Target/X86/README.txt diff -u llvm/lib/Target/X86/README.txt:1.97 llvm/lib/Target/X86/README.txt:1.98 --- llvm/lib/Target/X86/README.txt:1.97 Mon Apr 24 12:38:16 2006 +++ llvm/lib/Target/X86/README.txt Mon Apr 24 18:30:10 2006 @@ -1075,3 +1075,35 @@ There is also one case we do worse on PPC. //===-===// + +For this: + +#include <emmintrin.h> +void test(__m128d *r, __m128d *A, double B) { + *r = _mm_loadl_pd(*A, &B); +} + +We generate: + + subl $12, %esp + movsd 24(%esp), %xmm0 + movsd %xmm0, (%esp) + movl 20(%esp), %eax + movapd (%eax), %xmm0 + movlpd (%esp), %xmm0 + movl 16(%esp), %eax + movapd %xmm0, (%eax) + addl $12, %esp + ret + +icc generates: + +movl 4(%esp), %edx #3.6 +movl 8(%esp), %eax #3.6 +movapd (%eax), %xmm0 #4.22 +movlpd 12(%esp), %xmm0 #4.8 +movapd %xmm0, (%edx) #4.3 +ret #5.1 + +So icc is smart enough to know that B is in memory so it doesn't load it and +store it back to the stack. ___ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
[llvm-commits] CVS: llvm/include/llvm/IntrinsicsX86.td
Changes in directory llvm/include/llvm: IntrinsicsX86.td updated: 1.29 -> 1.30 --- Log message: Added X86 SSE2 intrinsics which can be represented as vector_shuffles. This is a temporary workaround for the 2-wide vector_shuffle problem (i.e. its mask would have type v2i32 which is not legal). --- Diffs of the changes: (+29 -1) IntrinsicsX86.td | 30 +- 1 files changed, 29 insertions(+), 1 deletion(-) Index: llvm/include/llvm/IntrinsicsX86.td diff -u llvm/include/llvm/IntrinsicsX86.td:1.29 llvm/include/llvm/IntrinsicsX86.td:1.30 --- llvm/include/llvm/IntrinsicsX86.td:1.29 Fri Apr 14 16:59:03 2006 +++ llvm/include/llvm/IntrinsicsX86.td Mon Apr 24 18:34:56 2006 @@ -445,7 +445,6 @@ def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">, Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - // FIXME: Temporary workaround since 2-wide shuffle is broken. def int_x86_sse2_movl_dq : GCCBuiltin<"__builtin_ia32_movqv4si">, Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">, @@ -463,6 +462,35 @@ Intrinsic<[llvm_void_ty], [IntrWriteMem]>; } +// Shuffles. +// FIXME: Temporary workarounds since 2-wide shuffle is broken. +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
+ def int_x86_sse2_movs_d : GCCBuiltin<"__builtin_ia32_movsd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty], [IntrNoMem]>; + def int_x86_sse2_loadh_pd : GCCBuiltin<"__builtin_ia32_loadhpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_ptr_ty], [IntrReadMem]>; + def int_x86_sse2_loadl_pd : GCCBuiltin<"__builtin_ia32_loadlpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_ptr_ty], [IntrReadMem]>; + def int_x86_sse2_shuf_pd : GCCBuiltin<"__builtin_ia32_shufpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty, llvm_int_ty], [IntrNoMem]>; + def int_x86_sse2_unpckh_pd : GCCBuiltin<"__builtin_ia32_unpckhpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty], [IntrNoMem]>; + def int_x86_sse2_unpckl_pd : GCCBuiltin<"__builtin_ia32_unpcklpd">, + Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, + llvm_v2f64_ty], [IntrNoMem]>; + def int_x86_sse2_punpckh_qdq : GCCBuiltin<"__builtin_ia32_punpckhqdq128">, + Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty], [IntrNoMem]>; + def int_x86_sse2_punpckl_qdq : GCCBuiltin<"__builtin_ia32_punpcklqdq128">, + Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty], [IntrNoMem]>; +} + //===--===// // SSE3 ___ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
[llvm-commits] CVS: llvm/lib/Target/X86/X86InstrSSE.td
Changes in directory llvm/lib/Target/X86: X86InstrSSE.td updated: 1.111 -> 1.112 --- Log message: Added X86 SSE2 intrinsics which can be represented as vector_shuffles. This is a temporary workaround for the 2-wide vector_shuffle problem (i.e. its mask would have type v2i32 which is not legal). --- Diffs of the changes: (+44 -12) X86InstrSSE.td | 56 1 files changed, 44 insertions(+), 12 deletions(-) Index: llvm/lib/Target/X86/X86InstrSSE.td diff -u llvm/lib/Target/X86/X86InstrSSE.td:1.111 llvm/lib/Target/X86/X86InstrSSE.td:1.112 --- llvm/lib/Target/X86/X86InstrSSE.td:1.111Mon Apr 24 16:58:20 2006 +++ llvm/lib/Target/X86/X86InstrSSE.td Mon Apr 24 18:34:56 2006 @@ -2212,11 +2212,6 @@ "movq {$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; -// FIXME: Temporary workaround since 2-wide shuffle is broken. -def MOVLQ128rr : PDI<0xD6, MRMSrcReg, (ops VR128:$dst, VR128:$src), - "movq {$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>; - // Move to lower bits of a VR128 and zeroing upper bits. // Loading from memory automatically zeroing upper bits. let AddedComplexity = 20 in { @@ -2241,13 +2236,16 @@ [(set VR128:$dst, (v4i32 (vector_shuffle immAllZerosV, (v4i32 (scalar_to_vector (loadi32 addr:$src))), MOVL_shuffle_mask)))]>; -def MOVZQI2PQIrr : PDI<0x7E, MRMSrcMem, (ops VR128:$dst, VR64:$src), - "movq {$src, $dst|$dst, $src}", []>; -def MOVZQI2PQIrm : PDI<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src), - "movq {$src, $dst|$dst, $src}", - [(set VR128:$dst, (bc_v2i64 (vector_shuffle immAllZerosV, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), - MOVL_shuffle_mask)))]>; +// Moving from XMM to XMM but still clear upper 64 bits. 
+def MOVZQI2PQIrr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>, + XS, Requires<[HasSSE2]>; +def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src), + "movq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_movl_dq + (bc_v4i32 (loadv2i64 addr:$src]>, + XS, Requires<[HasSSE2]>; } //===--===// @@ -2482,8 +2480,42 @@ def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2, MOVL_shuffle_mask)), (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + +// Set lowest element and zero upper elements. +def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), + MOVL_shuffle_mask)), + (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>; } +// FIXME: Temporary workaround since 2-wide shuffle is broken. +def : Pat<(int_x86_sse2_movs_d VR128:$src1, VR128:$src2), + (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_loadh_pd VR128:$src1, addr:$src2), + (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_loadl_pd VR128:$src1, addr:$src2), + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, VR128:$src2, imm:$src3), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$src3)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, (load addr:$src2), imm:$src3), + (SHUFPDrmi VR128:$src1, addr:$src2, imm:$src3)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, VR128:$src2), + (UNPCKHPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, (load addr:$src2)), + (UNPCKHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, VR128:$src2), + (UNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, (load addr:$src2)), + (UNPCKLPDrm VR128:$src1, 
addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, VR128:$src2), + (PUNPCKHQDQrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, (load addr:$src2)), + (PUNPCKHQDQrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, VR128:$src2), + (PUNPCKLQDQrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckl_qdq VR
[llvm-commits] CVS: llvm/lib/Target/X86/X86InstrSSE.td
Changes in directory llvm/lib/Target/X86: X86InstrSSE.td updated: 1.112 -> 1.113 --- Log message: Explicitly specify result type for def : Pat<> patterns (if it produces a vector result). Otherwise tblgen will pick the default (v16i8 for 128-bit vector). --- Diffs of the changes: (+47 -45) X86InstrSSE.td | 92 + 1 files changed, 47 insertions(+), 45 deletions(-) Index: llvm/lib/Target/X86/X86InstrSSE.td diff -u llvm/lib/Target/X86/X86InstrSSE.td:1.112 llvm/lib/Target/X86/X86InstrSSE.td:1.113 --- llvm/lib/Target/X86/X86InstrSSE.td:1.112Mon Apr 24 18:34:56 2006 +++ llvm/lib/Target/X86/X86InstrSSE.td Mon Apr 24 19:50:01 2006 @@ -2281,9 +2281,9 @@ // Scalar to v8i16 / v16i8. The source may be a R32, but only the lower 8 or // 16-bits matter. -def : Pat<(v8i16 (X86s2vec R32:$src)), (MOVDI2PDIrr R32:$src)>, +def : Pat<(v8i16 (X86s2vec R32:$src)), (v8i16 (MOVDI2PDIrr R32:$src))>, Requires<[HasSSE2]>; -def : Pat<(v16i8 (X86s2vec R32:$src)), (MOVDI2PDIrr R32:$src)>, +def : Pat<(v16i8 (X86s2vec R32:$src)), (v16i8 (MOVDI2PDIrr R32:$src))>, Requires<[HasSSE2]>; // bit_convert @@ -2353,17 +2353,17 @@ let AddedComplexity = 20 in { def : Pat<(v8i16 (vector_shuffle immAllZerosV, (v8i16 (X86s2vec R32:$src)), MOVL_shuffle_mask)), - (MOVZDI2PDIrr R32:$src)>, Requires<[HasSSE2]>; + (v8i16 (MOVZDI2PDIrr R32:$src))>, Requires<[HasSSE2]>; def : Pat<(v16i8 (vector_shuffle immAllZerosV, (v16i8 (X86s2vec R32:$src)), MOVL_shuffle_mask)), - (MOVZDI2PDIrr R32:$src)>, Requires<[HasSSE2]>; + (v16i8 (MOVZDI2PDIrr R32:$src))>, Requires<[HasSSE2]>; // Zeroing a VR128 then do a MOVS{S|D} to the lower bits. 
def : Pat<(v2f64 (vector_shuffle immAllZerosV, (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)), - (MOVLSD2PDrr (V_SET0_PD), FR64:$src)>, Requires<[HasSSE2]>; + (v2f64 (MOVLSD2PDrr (V_SET0_PD), FR64:$src))>, Requires<[HasSSE2]>; def : Pat<(v4f32 (vector_shuffle immAllZerosV, (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)), - (MOVLSS2PSrr (V_SET0_PS), FR32:$src)>, Requires<[HasSSE2]>; + (v4f32 (MOVLSS2PSrr (V_SET0_PS), FR32:$src))>, Requires<[HasSSE2]>; } // Splat v2f64 / v2i64 @@ -2404,115 +2404,117 @@ let AddedComplexity = 10 in { def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), - (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (v4f32 (UNPCKLPSrr VR128:$src, VR128:$src))>, Requires<[HasSSE2]>; def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), - (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (v16i8 (PUNPCKLBWrr VR128:$src, VR128:$src))>, Requires<[HasSSE2]>; def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), - (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (v8i16 (PUNPCKLWDrr VR128:$src, VR128:$src))>, Requires<[HasSSE2]>; def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), - (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; + (v4i32 (PUNPCKLDQrr VR128:$src, VR128:$src))>, Requires<[HasSSE1]>; } let AddedComplexity = 20 in { // vector_shuffle v1, <1, 1, 3, 3> def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), MOVSHDUP_shuffle_mask)), - (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>; + (v4i32 (MOVSHDUPrr VR128:$src))>, Requires<[HasSSE3]>; def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef), MOVSHDUP_shuffle_mask)), - (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>; + (v4i32 (MOVSHDUPrm addr:$src))>, Requires<[HasSSE3]>; // vector_shuffle v1, <0, 0, 2, 2> def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), MOVSLDUP_shuffle_mask)), - 
(MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>; + (v4i32 (MOVSLDUPrr VR128:$src))>, Requires<[HasSSE3]>; def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef), MOVSLDUP_shuffle_mask)), - (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>; + (v4i32 (MOVSLDUPrm addr:$src))>, Requires<[HasSSE3]>; } let AddedComplexity = 20 in { // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, MOVHP_shuffle_mask)), - (MOVLHPSrr VR128:$src1, VR128:$src2)>; + (v4i32 (MOVLHPSrr VR128:$src1, VR128:$src2))>; // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, MOVHLPS_shuffle_mask)), - (MOVHLPSrr VR128:$src1, VR128:$src2)>; + (v4i32 (M
[llvm-commits] CVS: llvm/lib/Target/PowerPC/PPCJITInfo.cpp
Changes in directory llvm/lib/Target/PowerPC: PPCJITInfo.cpp updated: 1.18 -> 1.19 --- Log message: No functionality changes, but cleaner code with correct comments. --- Diffs of the changes: (+41 -35) PPCJITInfo.cpp | 76 ++--- 1 files changed, 41 insertions(+), 35 deletions(-) Index: llvm/lib/Target/PowerPC/PPCJITInfo.cpp diff -u llvm/lib/Target/PowerPC/PPCJITInfo.cpp:1.18 llvm/lib/Target/PowerPC/PPCJITInfo.cpp:1.19 --- llvm/lib/Target/PowerPC/PPCJITInfo.cpp:1.18 Fri Apr 21 17:04:15 2006 +++ llvm/lib/Target/PowerPC/PPCJITInfo.cpp Mon Apr 24 23:45:59 2006 @@ -86,49 +86,55 @@ #endif extern "C" void PPC32CompilationCallbackC(unsigned *IntRegs, double *FPRegs) { - unsigned *CameFromStub = (unsigned*)__builtin_return_address(0+1); - unsigned *CameFromOrig = (unsigned*)__builtin_return_address(1+1); - unsigned *CCStackPtr = (unsigned*)__builtin_frame_address(0); -//unsigned *StubStackPtr = (unsigned*)__builtin_frame_address(1); - unsigned *OrigStackPtr = (unsigned*)__builtin_frame_address(2+1); - - // Adjust pointer to the branch, not the return address. - --CameFromStub; - - void *Target = JITCompilerFunction(CameFromStub); - - // Check to see if CameFromOrig[-1] is a 'bl' instruction, and if we can - // rewrite it to branch directly to the destination. If so, rewrite it so it - // does not need to go through the stub anymore. - unsigned CameFromOrigInst = CameFromOrig[-1]; - if ((CameFromOrigInst >> 26) == 18) { // Direct call. -intptr_t Offset = ((intptr_t)Target-(intptr_t)CameFromOrig+4) >> 2; + unsigned *StubCallAddrPlus4 = (unsigned*)__builtin_return_address(0+1); + unsigned *OrigCallAddrPlus4 = (unsigned*)__builtin_return_address(1+1); + unsigned *CurStackPtr = (unsigned*)__builtin_frame_address(0); + unsigned *OrigStackPtr = (unsigned*)__builtin_frame_address(2+1); + + // Adjust the pointer to the address of the call instruction in the stub + // emitted by emitFunctionStub, rather than the instruction after it. 
+ unsigned *StubCallAddr = StubCallAddrPlus4 - 1; + unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1; + + void *Target = JITCompilerFunction(StubCallAddr); + + // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite + // it to branch directly to the destination. If so, rewrite it so it does not + // need to go through the stub anymore. + unsigned OrigCallInst = *OrigCallAddr; + if ((OrigCallInst >> 26) == 18) { // Direct call. +intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2; + if (Offset >= -(1 << 23) && Offset < (1 << 23)) { // In range? // Clear the original target out. - CameFromOrigInst &= (63 << 26) | 3; + OrigCallInst &= (63 << 26) | 3; // Fill in the new target. - CameFromOrigInst |= (Offset & ((1 << 24)-1)) << 2; + OrigCallInst |= (Offset & ((1 << 24)-1)) << 2; // Replace the call. - CameFromOrig[-1] = CameFromOrigInst; + *OrigCallAddr = OrigCallInst; } } - // Locate the start of the stub. If this is a short call, adjust backwards - // the short amount, otherwise the full amount. - bool isShortStub = (*CameFromStub >> 26) == 18; - CameFromStub -= isShortStub ? 2 : 6; + // Assert that we are coming from a stub that was created with our + // emitFunctionStub. + assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!"); + StubCallAddr -= 6; // Rewrite the stub with an unconditional branch to the target, for any users // who took the address of the stub. - EmitBranchToAt(CameFromStub, Target, false); + EmitBranchToAt(StubCallAddr, Target, false); - // Change the SP so that we pop two stack frames off when we return. - *CCStackPtr = (intptr_t)OrigStackPtr; - - // Put the address of the stub and the LR value that originally came into the - // stub in a place that is easy to get on the stack after we restore all regs. - CCStackPtr[2] = (intptr_t)Target; - CCStackPtr[1] = (intptr_t)CameFromOrig; + // Change the stored stack pointer so that we pop three stack frames: + // 1. 
PPC32CompilationCallbackC's frame + // 2. _PPC32CompilationCallback's frame + // 3. the stub's frame + *CurStackPtr = (intptr_t)OrigStackPtr; + + // Put the address of the target function to call and the address to return to + // after calling the target function in a place that is easy to get on the + // stack after we restore all regs. + CurStackPtr[2] = (intptr_t)Target; + CurStackPtr[1] = (intptr_t)OrigCallAddrPlus4; // Note, this is not a standard epilog! #if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) @@ -141,12 +147,12 @@ "lfd f10, 72(%0)\n" "lfd f11, 80(%0)\n" "lfd f12, 88(%0)\n" "lfd f13, 96(%0)\n" "lmw r3, 0(%1)\n" // Load all integer regs - "lwz r0,4(r1)\n" // Get CameFromOrig (LR into stub) + "lwz r0,4(r1)\n" // Get OrigCallAddrPlus4 (LR value when stub was called) "mtlr r0\n"