On Mon, Sep 17, 2018 at 11:00 AM, Rhys Perry <pendingchao...@gmail.com> wrote: > NVC0_CB_AUX_BINDLESS_INFO isn't written to on Maxwell+ and it's too small > anyway. > > With these changes, TXQ is used to determine the number of samples and > the coordinate adjustment information looked up in a small array in the > driver constant buffer. > > v2: rework to use TXQ and a small array instead of a larger array with an > entry for each texture > > Signed-off-by: Rhys Perry <pendingchao...@gmail.com> > --- > .../drivers/nouveau/codegen/nv50_ir_driver.h | 1 + > .../codegen/nv50_ir_lowering_gm107.cpp | 4 +-- > .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 31 +++++++++++++++++-- > .../nouveau/codegen/nv50_ir_lowering_nvc0.h | 3 +- > .../nouveau/codegen/nv50_ir_peephole.cpp | 1 + > .../drivers/nouveau/nvc0/mme/com9097.mme | 8 ++--- > .../drivers/nouveau/nvc0/mme/com9097.mme.h | 8 ++--- > .../drivers/nouveau/nvc0/nvc0_context.h | 23 ++++++++------ > .../drivers/nouveau/nvc0/nvc0_program.c | 1 + > .../drivers/nouveau/nvc0/nvc0_screen.c | 15 +++++++++ > .../drivers/nouveau/nvc0/nve4_compute.c | 22 +++++++++++++ > 11 files changed, 94 insertions(+), 23 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h > b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h > index 7c835ceab8..b3da6fc3cf 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h > @@ -188,6 +188,7 @@ struct nv50_ir_prog_info > uint8_t msInfoCBSlot; /* cX[] used for multisample info */ > uint16_t msInfoBase; /* base address for multisample info */ > uint16_t uboInfoBase; /* base address for compute UBOs (gk104+) */ > + uint16_t msAdjInfoBase; /* base address for MS coordinate > adjustment info */ > } io; > > /* driver callback to assign input/output locations */ > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp > 
index c7436e2e29..49a5f3b01f 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp > @@ -320,11 +320,11 @@ GM107LoweringPass::handleSUQ(TexInstruction *suq) > > if (mask & 0x1) > bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0), > - loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), > suq->tex.bindless)); > + loadMsAdjInfo32(suq->tex.target, 0, slot, ind, > suq->tex.bindless)); > if (mask & 0x2) { > int d = util_bitcount(mask & 0x1); > bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d), > - loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), > suq->tex.bindless)); > + loadMsAdjInfo32(suq->tex.target, 1, slot, ind, > suq->tex.bindless)); > } > } > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > index 176e0cf608..5db29ba799 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > @@ -1732,6 +1732,33 @@ NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, > uint32_t off, bool bindless > prog->driver->io.suInfoBase); > } > > +inline Value * > +NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t > index, int slot, Value *ind, bool bindless) > +{ > + if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET) > + return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless); > + > + assert(bindless); > + > + Value *samples = bld.getSSA(); > + // This shouldn't be lowered because it's being inserted before the > current instruction > + TexInstruction *tex = new_TexInstruction(func, OP_TXQ); > + tex->tex.target = target; > + tex->tex.query = TXQ_TYPE; > + tex->tex.mask = 0x4; > + tex->tex.r = 0xff; > + tex->tex.s = 0x1f; > + tex->tex.rIndirectSrc = 0; > + tex->setDef(0, samples); > + tex->setSrc(0, ind); > + tex->setSrc(1, bld.loadImm(NULL, 0)); > + bld.insert(tex); > 
+ > + // XMAD has a higher throughput than SHL and we shouldn't be dealing with > >65535 integers here > + Value *ptr = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(), samples, > bld.mkImm(8), bld.mkImm(0)); > + return loadResInfo32(ptr, index * 4, prog->driver->io.msAdjInfoBase); > +} > + > static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c) > { > switch (su->tex.target.getEnum()) { > @@ -1817,8 +1844,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction > *tex) > Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA(); > Value *ind = tex->getIndirectR(); > > - Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), > tex->tex.bindless); > - Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), > tex->tex.bindless); > + Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, > tex->tex.bindless); > + Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, > tex->tex.bindless); > > bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x); > bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y); > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h > b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h > index 5dbb3e4f00..4136b1ecfe 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h > @@ -148,7 +148,7 @@ protected: > void handlePIXLD(Instruction *); > > void checkPredicate(Instruction *); > - Value *loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless); > + Value *loadMsAdjInfo32(TexInstruction::Target targ, uint32_t index, int > slot, Value *ind, bool bindless); > > virtual bool visit(Instruction *); > > @@ -161,6 +161,7 @@ private: > Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base); > Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base); > Value *loadResLength32(Value *ptr, uint32_t off, uint16_t base); > + Value *loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless); > Value 
*loadBufInfo64(Value *ptr, uint32_t off); > Value *loadBufLength32(Value *ptr, uint32_t off); > Value *loadUboInfo64(Value *ptr, uint32_t off); > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > index d851cf3c37..f91c502e9e 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > @@ -317,6 +317,7 @@ IndirectPropagation::visit(BasicBlock *bb) > ImmediateValue imm; > if (!i->src(s).isIndirect(0)) > continue; > + > insn = i->getIndirect(s, 0)->getInsn(); > if (!insn) > continue; > diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme > b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme > index 38c2e86843..8ca8f34f9b 100644 > --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme > +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme > @@ -255,7 +255,7 @@ dei_draw_again: > parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */ > parm $r4 send $r4 /* index_bias, send start */ > maddr 0x18e3 /* CB_POS */ > - send 0x1a0 /* 256 + 160 */ > + send 0x1e0 /* 256 + 224 */ > braz $r2 #dei_end > parm $r5 send $r4 /* start_instance, send index_bias */ > send $r5 /* send start_instance */ > @@ -311,7 +311,7 @@ dai_draw_again: > braz $r3 #dai_end > parm $r4 send $r4 /* start_instance */ > maddr 0x18e3 /* CB_POS */ > - send 0x1a0 /* 256 + 160 */ > + send 0x1e0 /* 256 + 224 */ > send 0x0 /* send 0 as base_vertex */ > send $r4 /* send start_instance */ > send $r6 /* draw id */ > @@ -374,7 +374,7 @@ deic_draw_again: > parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */ > parm $r4 send $r4 /* index_bias, send start */ > maddr 0x18e3 /* CB_POS */ > - send 0x1a0 /* 256 + 160 */ > + send 0x1e0 /* 256 + 224 */ > braz $r2 #deic_end > parm $r5 send $r4 /* start_instance, send index_bias */ > send $r5 /* send start_instance */ > @@ -455,7 +455,7 @@ daic_draw_again: > braz $r3 #daic_end > parm $r4 send $r4 /* start_instance 
*/ > maddr 0x18e3 /* CB_POS */ > - send 0x1a0 /* 256 + 160 */ > + send 0x1e0 /* 256 + 224 */ > send 0x0 /* send 0 as base_vertex */ > send $r4 /* send start_instance */ > send $r6 /* draw id */ > diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h > b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h > index 49c0891114..47c5e6c6e0 100644 > --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h > +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h > @@ -140,7 +140,7 @@ uint32_t mme9097_draw_elts_indirect[] = { > 0x017dc451, > 0x00002431, > 0x0638c021, > - 0x00680041, > + 0x00780041, > 0x0004d007, > 0x00002531, > 0x00002841, > @@ -185,7 +185,7 @@ uint32_t mme9097_draw_arrays_indirect[] = { > 0x0004d807, > 0x00002431, > 0x0638c021, > - 0x00680041, > + 0x00780041, > 0x00000041, > 0x00002041, > 0x00003041, > @@ -233,7 +233,7 @@ uint32_t mme9097_draw_elts_indirect_count[] = { > 0x017dc451, > 0x00002431, > 0x0638c021, > - 0x00680041, > + 0x00780041, > 0x0004d007, > 0x00002531, > 0x00002841, > @@ -300,7 +300,7 @@ uint32_t mme9097_draw_arrays_indirect_count[] = { > 0x0004d807, > 0x00002431, > 0x0638c021, > - 0x00680041, > + 0x00780041, > 0x00000041, > 0x00002041, > 0x00003041, > diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h > b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h > index 77237a3c0a..1d920c26f5 100644 > --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h > +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h > @@ -122,35 +122,38 @@ > /* 8 sets of 32-bits coordinate offsets */ > #define NVC0_CB_AUX_MS_INFO 0x0c0 > #define NVC0_CB_AUX_MS_SIZE (8 * 2 * 4) > +/* 8 sets of 32-bit pairs containing coordinate adjustment information */ > +#define NVC0_CB_AUX_MS_ADJ_INFO(i) 0x100 + (i) * 4 * 2 > +#define NVC0_CB_AUX_MS_ADJ_SIZE (8 * 2 * 4) > /* block/grid size, at 3 32-bits integers each, gridid and work_dim */ > -#define NVC0_CB_AUX_GRID_INFO(i) 0x100 + (i) * 4 /* CP */ > +#define NVC0_CB_AUX_GRID_INFO(i) 0x140 + (i) * 4 /* CP */ > 
#define NVC0_CB_AUX_GRID_SIZE (8 * 4) > /* FB texture handle */ > -#define NVC0_CB_AUX_FB_TEX_INFO 0x100 /* FP */ > +#define NVC0_CB_AUX_FB_TEX_INFO 0x140 /* FP */ > #define NVC0_CB_AUX_FB_TEX_SIZE (4) > /* 8 user clip planes, at 4 32-bits floats each */ > -#define NVC0_CB_AUX_UCP_INFO 0x120 > +#define NVC0_CB_AUX_UCP_INFO 0x160 > #define NVC0_CB_AUX_UCP_SIZE (PIPE_MAX_CLIP_PLANES * 4 * 4) > /* 13 ubos, at 4 32-bits integer each */ > -#define NVC0_CB_AUX_UBO_INFO(i) 0x120 + (i) * 4 * 4 /* CP */ > +#define NVC0_CB_AUX_UBO_INFO(i) 0x160 + (i) * 4 * 4 /* CP */ > #define NVC0_CB_AUX_UBO_SIZE ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4) > /* 8 sets of 32-bits integer pairs sample offsets */ > -#define NVC0_CB_AUX_SAMPLE_INFO 0x1a0 /* FP */ > +#define NVC0_CB_AUX_SAMPLE_INFO 0x1e0 /* FP */ > /* 256 bytes, though only 64 bytes used before GM200 */ > #define NVC0_CB_AUX_SAMPLE_SIZE (8 * 2 * 4 * 4) > /* draw parameters (index bais, base instance, drawid) */ > -#define NVC0_CB_AUX_DRAW_INFO 0x1a0 /* VP */ > +#define NVC0_CB_AUX_DRAW_INFO 0x1e0 /* VP */ > /* 32 user buffers, at 4 32-bits integers each */ > -#define NVC0_CB_AUX_BUF_INFO(i) 0x2a0 + (i) * 4 * 4 > +#define NVC0_CB_AUX_BUF_INFO(i) 0x2e0 + (i) * 4 * 4 > #define NVC0_CB_AUX_BUF_SIZE (NVC0_MAX_BUFFERS * 4 * 4) > /* 8 surfaces, at 16 32-bits integers each */ > -#define NVC0_CB_AUX_SU_INFO(i) 0x4a0 + (i) * 16 * 4 > +#define NVC0_CB_AUX_SU_INFO(i) 0x4e0 + (i) * 16 * 4 > #define NVC0_CB_AUX_SU_SIZE (NVC0_MAX_IMAGES * 16 * 4) > /* 1 64-bits address and 1 32-bits sequence */ > -#define NVC0_CB_AUX_MP_INFO 0x6a0 > +#define NVC0_CB_AUX_MP_INFO 0x6e0 > #define NVC0_CB_AUX_MP_SIZE 3 * 4 > /* 512 64-byte blocks for bindless image handles */ > -#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6b0 + (i) * 16 * 4 > +#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6f0 + (i) * 16 * 4 > #define NVC0_CB_AUX_BINDLESS_SIZE (NVE4_IMG_MAX_HANDLES * 16 * 4) > /* 4 32-bits floats for the vertex runout, put at the end */ > #define NVC0_CB_AUX_RUNOUT_INFO 
NVC0_CB_USR_SIZE + (NVC0_CB_AUX_SIZE * 6) > diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c > b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c > index 57d98753f4..b3a0954d76 100644 > --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c > +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c > @@ -600,6 +600,7 @@ nvc0_program_translate(struct nvc0_program *prog, > uint16_t chipset, > info->io.ucpBase = NVC0_CB_AUX_UCP_INFO; > info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO; > info->io.msInfoBase = NVC0_CB_AUX_MS_INFO; > + info->io.msAdjInfoBase = NVC0_CB_AUX_MS_ADJ_INFO(0); > info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0); > info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0); > if (info->target >= NVISA_GK104_CHIPSET) { > diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c > b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c > index 2eecf59ce0..f67e42052e 100644 > --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c > +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c > @@ -1362,6 +1362,21 @@ nvc0_screen_create(struct nouveau_device *dev) > PUSH_DATA (push, 1); > PUSH_DATA (push, 3); /* 7 */ > PUSH_DATA (push, 1); > + > + /* MS coordinate adjustment information */ > + for (int i = 1; i <= 8; i *= 2) { > + BEGIN_1IC0(push, NVC0_3D(CB_POS), 3); > + PUSH_DATA (push, NVC0_CB_AUX_MS_ADJ_INFO(i)); > + int ms_x = 0, ms_y = 0; > + switch (i) { > + case 1: break; > + case 2: ms_x = 1; break; > + case 4: ms_x = 1; ms_y = 1; break; > + case 8: ms_x = 2; ms_y = 1; break; > + } > + PUSH_DATA(push, ms_x); > + PUSH_DATA(push, ms_y); > + } > } > BEGIN_NVC0(push, NVC0_3D(LINKED_TSC), 1); > PUSH_DATA (push, 0); > diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c > b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c > index 8aa8d4936f..b7af7ab0d2 100644 > --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c > +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c > @@ -168,6 +168,28 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, > PUSH_DATA (push, 3); 
/* 7 */ > PUSH_DATA (push, 1); > > + /* MS coordinate adjustment information */ > + for (int i = 1; i <= 8; i *= 2) { > + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 4); > + PUSH_DATA (push, 8); > + PUSH_DATA (push, 1); > + PUSH_DATAh(push, address + NVC0_CB_AUX_MS_ADJ_INFO(i)); > + PUSH_DATA (push, address + NVC0_CB_AUX_MS_ADJ_INFO(i)); > + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 3); > + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); > + > + int ms_x = 0, ms_y = 0; > + switch (i) { > + case 1: break; > + case 2: ms_x = 1; break; > + case 4: ms_x = 1; ms_y = 1; break; > + case 8: ms_x = 2; ms_y = 1; break;
Is this really necessary? Couldn't you just adjust the generated code to compute something like `ms_x = (samples + 2) >> 2` and `ms_y = samples > 2`, and avoid the constbuf bit entirely? [And yeah, this falls down with a number of samples that isn't 1/2/4/8, but that really shouldn't happen.] -ilia _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev