Hi Rafael, Are you sure this is better? Did you do any measurement? Our goal isn't to match gcc output. :)
Perhaps you can add some unit tests to llvm-test? Thanks, Evan On Sep 28, 2007, at 5:53 AM, Rafael Espindola <[EMAIL PROTECTED] > wrote: > Author: rafael > Date: Fri Sep 28 07:53:01 2007 > New Revision: 42433 > > URL: http://llvm.org/viewvc/llvm-project?rev=42433&view=rev > Log: > Refactor the memcpy lowering for the x86 target. > > The only generated code difference is that now we call memcpy when > the size of the array is unknown. This matches GCC behavior and is > better since the run time value can be arbitrarily large. > > > Added: > llvm/trunk/test/CodeGen/X86/memcpy.ll > Modified: > llvm/trunk/lib/Target/X86/X86ISelLowering.cpp > llvm/trunk/lib/Target/X86/X86ISelLowering.h > > Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp > URL: > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=42433&r1=42432&r2=42433&view=diff > > === > === > === > ===================================================================== > --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original) > +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Sep 28 > 07:53:01 2007 > @@ -4188,35 +4188,61 @@ > } > > SDOperand X86TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG > &DAG) { > - SDOperand Chain = Op.getOperand(0); > - unsigned Align = > - (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue(); > + SDOperand ChainOp = Op.getOperand(0); > + SDOperand DestOp = Op.getOperand(1); > + SDOperand SourceOp = Op.getOperand(2); > + SDOperand CountOp = Op.getOperand(3); > + SDOperand AlignOp = Op.getOperand(4); > + unsigned Align = (unsigned)cast<ConstantSDNode>(AlignOp)->getValue > (); > if (Align == 0) Align = 1; > > - ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3)); > - // If not DWORD aligned or size is more than the threshold, call > memcpy. > - // The libc version is likely to be faster for these cases. It > can use the > - // address value and run time information about the CPU. > + // The libc version is likely to be faster for the following > cases. It can > + // use the address value and run time information about the CPU. > // With glibc 2.6.1 on a core 2, coping an array of 100M longs was > 30% faster > - if ((Align & 3) != 0 || > - (I && I->getValue() > Subtarget->getMinRepStrSizeThreshold > ())) { > - MVT::ValueType IntPtr = getPointerTy(); > - TargetLowering::ArgListTy Args; > - TargetLowering::ArgListEntry Entry; > - Entry.Ty = getTargetData()->getIntPtrType(); > - Entry.Node = Op.getOperand(1); Args.push_back(Entry); > - Entry.Node = Op.getOperand(2); Args.push_back(Entry); > - Entry.Node = Op.getOperand(3); Args.push_back(Entry); > - std::pair<SDOperand,SDOperand> CallResult = > + > + // If not DWORD aligned, call memcpy. > + if ((Align & 3) != 0) > + return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); > + > + // If size is unknown, call memcpy. > + ConstantSDNode *I = dyn_cast<ConstantSDNode>(CountOp); > + if (!I) > + return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); > + > + // If size is more than the threshold, call memcpy. > + unsigned Size = I->getValue(); > + if (Size > Subtarget->getMinRepStrSizeThreshold()) > + return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); > + > + return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, > DAG); > +} > + > +SDOperand X86TargetLowering::LowerMEMCPYCall(SDOperand Chain, > + SDOperand Dest, > + SDOperand Source, > + SDOperand Count, > + SelectionDAG &DAG) { > + MVT::ValueType IntPtr = getPointerTy(); > + TargetLowering::ArgListTy Args; > + TargetLowering::ArgListEntry Entry; > + Entry.Ty = getTargetData()->getIntPtrType(); > + Entry.Node = Dest; Args.push_back(Entry); > + Entry.Node = Source; Args.push_back(Entry); > + Entry.Node = Count; Args.push_back(Entry); > + std::pair<SDOperand,SDOperand> CallResult = > LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, > false, > DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG); > - return CallResult.second; > - } > + return CallResult.second; > +} > > +SDOperand X86TargetLowering::LowerMEMCPYInline(SDOperand Chain, > + SDOperand Dest, > + SDOperand Source, > + unsigned Size, > + unsigned Align, > + SelectionDAG &DAG) { > MVT::ValueType AVT; > - SDOperand Count; > unsigned BytesLeft = 0; > - bool TwoRepMovs = false; > switch (Align & 3) { > case 2: // WORD aligned > AVT = MVT::i16; > @@ -4228,33 +4254,22 @@ > break; > default: // Byte aligned > AVT = MVT::i8; > - Count = Op.getOperand(3); > break; > } > > - if (AVT > MVT::i8) { > - if (I) { > - unsigned UBytes = MVT::getSizeInBits(AVT) / 8; > - Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy > ()); > - BytesLeft = I->getValue() % UBytes; > - } else { > - assert(AVT >= MVT::i32 && > - "Do not use rep;movs if not at least DWORD aligned"); > - Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(), > - Op.getOperand(3), DAG.getConstant(2, > MVT::i8)); > - TwoRepMovs = true; > - } > - } > + unsigned UBytes = MVT::getSizeInBits(AVT) / 8; > + SDOperand Count = DAG.getConstant(Size / UBytes, getPointerTy()); > + BytesLeft = Size % UBytes; > > SDOperand InFlag(0, 0); > Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : > X86::ECX, > Count, InFlag); > InFlag = Chain.getValue(1); > Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : > X86::EDI, > - Op.getOperand(1), InFlag); > + Dest, InFlag); > InFlag = Chain.getValue(1); > Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RSI : > X86::ESI, > - Op.getOperand(2), InFlag); > + Source, InFlag); > InFlag = Chain.getValue(1); > > SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); > @@ -4264,27 +4279,12 @@ > Ops.push_back(InFlag); > Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size()); > > - if (TwoRepMovs) { > - InFlag = Chain.getValue(1); > - Count = Op.getOperand(3); > - MVT::ValueType CVT = Count.getValueType(); > - SDOperand Left = DAG.getNode(ISD::AND, CVT, Count, > - DAG.getConstant((AVT == MVT::i64) ? > 7 : 3, CVT)); > - Chain = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : > X86::ECX, > - Left, InFlag); > - InFlag = Chain.getValue(1); > - Tys = DAG.getVTList(MVT::Other, MVT::Flag); > - Ops.clear(); > - Ops.push_back(Chain); > - Ops.push_back(DAG.getValueType(MVT::i8)); > - Ops.push_back(InFlag); > - Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size()); > - } else if (BytesLeft) { > + if (BytesLeft) { > // Issue loads and stores for the last 1 - 7 bytes. > - unsigned Offset = I->getValue() - BytesLeft; > - SDOperand DstAddr = Op.getOperand(1); > + unsigned Offset = Size - BytesLeft; > + SDOperand DstAddr = Dest; > MVT::ValueType DstVT = DstAddr.getValueType(); > - SDOperand SrcAddr = Op.getOperand(2); > + SDOperand SrcAddr = Source; > MVT::ValueType SrcVT = SrcAddr.getValueType(); > SDOperand Value; > if (BytesLeft >= 4) { > > Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h > URL: > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=42433&r1=42432&r2=42433&view=diff > > === > === > === > ===================================================================== > --- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original) > +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Fri Sep 28 07:53:01 > 2007 > @@ -437,6 +437,12 @@ > SDOperand LowerBRCOND(SDOperand Op, SelectionDAG &DAG); > SDOperand LowerBRCOND_New(SDOperand Op, SelectionDAG &DAG); > SDOperand LowerMEMSET(SDOperand Op, SelectionDAG &DAG); > + SDOperand LowerMEMCPYInline(SDOperand Dest, SDOperand Source, > + SDOperand Chain, unsigned Size, > unsigned Align, > + SelectionDAG &DAG); > + SDOperand LowerMEMCPYCall(SDOperand ChainOp, SDOperand DestOp, > + SDOperand SourceOp, SDOperand CountOp, > + SelectionDAG &DAG); > SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG); > SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG); > SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG); > > Added: llvm/trunk/test/CodeGen/X86/memcpy.ll > URL: > http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/memcpy.ll?rev=42433&view=auto > > === > === > === > ===================================================================== > --- llvm/trunk/test/CodeGen/X86/memcpy.ll (added) > +++ llvm/trunk/test/CodeGen/X86/memcpy.ll Fri Sep 28 07:53:01 2007 > @@ -0,0 +1,17 @@ > +; RUN: llvm-as < %s | llc -march=x86-64 | grep call.*memcpy | count 2 > + > +declare void @llvm.memcpy.i64(i8*, i8*, i64, i32) > + > +define i8* @my_memcpy(i8* %a, i8* %b, i64 %n) { > +entry: > + tail call void @llvm.memcpy.i64( i8* %a, i8* %b, i64 %n, i32 1 ) > + ret i8* %a > +} > + > +define i8* @my_memcpy2(i64* %a, i64* %b, i64 %n) { > +entry: > + %tmp14 = bitcast i64* %a to i8* > + %tmp25 = bitcast i64* %b to i8* > + tail call void @llvm.memcpy.i64(i8* %tmp14, i8* %tmp25, i64 %n, > i32 8 ) > + ret i8* %tmp14 > +} > > > _______________________________________________ > llvm-commits mailing list > llvm-commits@cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits _______________________________________________ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits