I think we could do something else (which may even cover more cases): 1. try to use a shl (we already do that)
2 use shladd for all negative imms with for all power of two negative immediates (are we already doing it? I think we miss a lot of opts where "worse" instructions could include modifiers and we basically save neg/abs instructions. OP_SHL can't take a neg, so we have to use OP_SHLADD for that) b = shladd(neg a, log2(imm), 0x0) 3. for all immediates in [0xffff, 0x0]: t = xmad(a, imm, 0x0) b = xmad.PSL(a.hi, imm, t) which should be already quite good. I don't know if using shifts or shladd is faster then xmad, so without benchmarks I wouldn't want to include more complex optimizations if we don't know they pay off. Nvidia doesn't seem to do that either, but they use shladd in that negative immediate case. Maybe they don't do it for the trivial case shladd(a, log2(imm), a) for power of two + 1 because they simply don't care. Don't know. I am sure it would be worth it to see where it actually makes a different adding your opts after the three above ones. Or maybe yours is also faster than the third one above. I don't know. On Mon, Jul 23, 2018 at 12:40 PM, Rhys Perry <pendingchao...@gmail.com> wrote: > Strongly mitigates the harm from the previous commit, which made many > integer multiplications much more heavy on the register and instruction > count. > > total instructions in shared programs : 5839715 -> 5801926 (-0.65%) > total gprs used in shared programs : 670553 -> 669853 (-0.10%) > total shared used in shared programs : 548832 -> 548832 (0.00%) > total local used in shared programs : 21164 -> 21068 (-0.45%) > > local shared gpr inst bytes > helped 1 0 408 2522 2522 > hurt 0 0 232 23 23 > > Signed-off-by: Rhys Perry <pendingchao...@gmail.com> > --- > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 126 > ++++++++++++++++++--- > 1 file changed, 112 insertions(+), 14 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > index a6ddb284b8..e5d033c9b0 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > @@ -379,6 +379,10 @@ private: > > CmpInstruction *findOriginForTestWithZero(Value *); > > + Value *createMulMethod1(Value *a, unsigned b, Value *c); > + Value *createMulMethod2(Value *a, unsigned b, Value *c); > + Value *createMul(Value *a, unsigned b, Value *c); > + > unsigned int foldCount; > > BuildUtil bld; > @@ -953,6 +957,88 @@ ConstantFolding::opnd3(Instruction *i, ImmediateValue > &imm2) > } > } > > +Value * > +ConstantFolding::createMulMethod1(Value *a, unsigned b, Value *c) > +{ > + // basically constant folded shift and add multiplication. > + Value *res = c ? c : bld.loadImm(NULL, 0u); > + bool resZero = !c; > + unsigned ashift = 0; > + while (b) { > + if ((b & 1) && ashift) { > + Value *sh = bld.loadImm(NULL, ashift); > + if (resZero) > + res = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), a, sh); > + else > + res = bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), a, sh, res); > + resZero = false; > + } else if (b & 1) { > + if (resZero) > + res = a; > + else > + res = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), res, a); > + resZero = false; > + } > + b >>= 1; > + ashift++; > + } > + return res; > +} > + > +Value * > +ConstantFolding::createMulMethod2(Value *a, unsigned b, Value *c) > +{ > + // basically does a * b2 - a * (b2 - b) + c > + uint64_t b2 = util_next_power_of_two64(b); > + unsigned b2shift = ffsll(b2) - 1; > + > + Value *mul1 = createMulMethod1(a, b2 - b, NULL); > + > + Value *res; > + if (b2shift < 32) { > + Instruction *i = bld.mkOp3(OP_SHLADD, TYPE_U32, bld.getSSA(), > + a, bld.loadImm(NULL, b2shift), mul1); > + res = i->getDef(0); > + > + // all targets supporting OP_SHLADD should pass this > + assert(bld.getProgram()->getTarget()->isModSupported(i, 2, > NV50_IR_MOD_NEG)); > + i->src(2).mod *= Modifier(NV50_IR_MOD_NEG); > + } else { > + res = bld.mkOp1v(OP_NEG, TYPE_U32, bld.getSSA(), mul1); > + } > + > + if (c) > + res = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), res, c); > + > + return res; > +} > + > +Value * > +ConstantFolding::createMul(Value *a, unsigned b, Value *c) > +{ > + unsigned cost[2]; > + > + // estimate cost for first method (a << i) + (b << j) + ... > + cost[0] = util_bitcount64(b >> 1); > + > + // estimate cost for second method (a << i) - ((a << j) + (a << k) + ...) > + uint64_t rounded_b = util_next_power_of_two64(b); > + cost[1] = rounded_b == b ? 1 : (util_bitcount64((rounded_b - b) >> 1) + > 1); > + if (c) cost[1]++; > + > + // The general method, multiplication by XMADs, costs three instructions. > + // So nothing much larger than that or it could be making things worse. > + if (cost[0] > 3 && cost[1] > 3) > + return NULL; > + > + // the cost is the same for both methods with powers of twos > + // but method 1 creates more optimizable code > + if (cost[0] <= cost[1]) > + return createMulMethod1(a, b, c); > + else > + return createMulMethod2(a, b, c); > +} > + > void > ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) > { > @@ -1040,13 +1126,25 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue > &imm0, int s) > i->setSrc(s, i->getSrc(t)); > i->src(s).mod = i->src(t).mod; > } else > - if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) { > - i->op = OP_SHL; > - imm0.applyLog2(); > - i->setSrc(0, i->getSrc(t)); > - i->src(0).mod = i->src(t).mod; > - i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); > - i->src(1).mod = 0; > + if (!isFloatType(i->dType)) { > + bool optimized = false; > + if (target->isOpSupported(OP_SHLADD, TYPE_U32)) { > + bld.setPosition(i, false); > + Value *val = createMul(i->getSrc(t), imm0.reg.data.u32, NULL); > + if (val) { > + i->def(0).replace(val, false); > + delete_Instruction(prog, i); > + optimized = true; > + } > + } > + if (!optimized && !imm0.isNegative() && imm0.isPow2()) { > + i->op = OP_SHL; > + imm0.applyLog2(); > + i->setSrc(0, i->getSrc(t)); > + i->src(0).mod = i->src(t).mod; > + i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); > + i->src(1).mod = 0; > + } > } else > if (i->postFactor && i->sType == TYPE_F32) { > /* Can't emit a postfactor with an immediate, have to fold it in */ > @@ -1079,13 +1177,13 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue > &imm0, int s) > i->setSrc(2, NULL); > i->op = OP_ADD; > } else > - if (s == 1 && !imm0.isNegative() && imm0.isPow2() && > - !isFloatType(i->dType) && > - target->isOpSupported(OP_SHLADD, i->dType) && > - !i->subOp) { > - i->op = OP_SHLADD; > - imm0.applyLog2(); > - i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); > + if (!isFloatType(i->dType) && target->isOpSupported(OP_SHLADD, > TYPE_U32) && !i->subOp) { > + bld.setPosition(i, false); > + Value *val = createMul(i->getSrc(t), imm0.reg.data.u32, > i->getSrc(2)); > + if (val) { > + i->def(0).replace(val, false); > + delete_Instruction(prog, i); > + } > } > break; > case OP_SUB: > -- > 2.14.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev