NVC0_CB_AUX_BINDLESS_INFO isn't written to on Maxwell+ and it's too small anyway.
With these changes, TXQ is used to determine the number of samples, and the coordinate adjustment information is looked up in a small array in the driver constant buffer. v2: rework to use TXQ and a small array instead of a larger array with an entry for each texture. Signed-off-by: Rhys Perry <pendingchao...@gmail.com> --- .../drivers/nouveau/codegen/nv50_ir_driver.h | 1 + .../codegen/nv50_ir_lowering_gm107.cpp | 4 +-- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 31 +++++++++++++++++-- .../nouveau/codegen/nv50_ir_lowering_nvc0.h | 3 +- .../nouveau/codegen/nv50_ir_peephole.cpp | 1 + .../drivers/nouveau/nvc0/mme/com9097.mme | 8 ++--- .../drivers/nouveau/nvc0/mme/com9097.mme.h | 8 ++--- .../drivers/nouveau/nvc0/nvc0_context.h | 23 ++++++++------ .../drivers/nouveau/nvc0/nvc0_program.c | 1 + .../drivers/nouveau/nvc0/nvc0_screen.c | 15 +++++++++ .../drivers/nouveau/nvc0/nve4_compute.c | 22 +++++++++++++ 11 files changed, 94 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index 7c835ceab8..b3da6fc3cf 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -188,6 +188,7 @@ struct nv50_ir_prog_info uint8_t msInfoCBSlot; /* cX[] used for multisample info */ uint16_t msInfoBase; /* base address for multisample info */ uint16_t uboInfoBase; /* base address for compute UBOs (gk104+) */ + uint16_t msAdjInfoBase; /* base address for MS coordinate adjustment info */ } io; /* driver callback to assign input/output locations */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp index c7436e2e29..49a5f3b01f 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp @@ -320,11 +320,11 @@ GM107LoweringPass::handleSUQ(TexInstruction *suq) if 
(mask & 0x1) bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0), - loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless)); + loadMsAdjInfo32(suq->tex.target, 0, slot, ind, suq->tex.bindless)); if (mask & 0x2) { int d = util_bitcount(mask & 0x1); bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d), - loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless)); + loadMsAdjInfo32(suq->tex.target, 1, slot, ind, suq->tex.bindless)); } } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 176e0cf608..5db29ba799 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1732,6 +1732,33 @@ NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless prog->driver->io.suInfoBase); } +inline Value * +NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless) +{ + if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET) + return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless); + + assert(bindless); + + Value *samples = bld.getSSA(); + // This shouldn't be lowered because it's being inserted before the current instruction + TexInstruction *tex = new_TexInstruction(func, OP_TXQ); + tex->tex.target = target; + tex->tex.query = TXQ_TYPE; + tex->tex.mask = 0x4; + tex->tex.r = 0xff; + tex->tex.s = 0x1f; + tex->tex.rIndirectSrc = 0; + tex->setDef(0, samples); + tex->setSrc(0, ind); + tex->setSrc(1, bld.loadImm(NULL, 0)); + bld.insert(tex); + + // XMAD has a higher throughput than SHL and we shouldn't be dealing with >65535 integers here + Value *ptr = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(8), bld.mkImm(0)); + return loadResInfo32(ptr, index * 4, prog->driver->io.msAdjInfoBase); +} + static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c) { switch 
(su->tex.target.getEnum()) { @@ -1817,8 +1844,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex) Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA(); Value *ind = tex->getIndirectR(); - Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), tex->tex.bindless); - Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), tex->tex.bindless); + Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless); + Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless); bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x); bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h index 5dbb3e4f00..4136b1ecfe 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h @@ -148,7 +148,7 @@ protected: void handlePIXLD(Instruction *); void checkPredicate(Instruction *); - Value *loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless); + Value *loadMsAdjInfo32(TexInstruction::Target targ, uint32_t index, int slot, Value *ind, bool bindless); virtual bool visit(Instruction *); @@ -161,6 +161,7 @@ private: Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base); Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base); Value *loadResLength32(Value *ptr, uint32_t off, uint16_t base); + Value *loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless); Value *loadBufInfo64(Value *ptr, uint32_t off); Value *loadBufLength32(Value *ptr, uint32_t off); Value *loadUboInfo64(Value *ptr, uint32_t off); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index d851cf3c37..f91c502e9e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -317,6 +317,7 @@ 
IndirectPropagation::visit(BasicBlock *bb) ImmediateValue imm; if (!i->src(s).isIndirect(0)) continue; + insn = i->getIndirect(s, 0)->getInsn(); if (!insn) continue; diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme index 38c2e86843..8ca8f34f9b 100644 --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme @@ -255,7 +255,7 @@ dei_draw_again: parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */ parm $r4 send $r4 /* index_bias, send start */ maddr 0x18e3 /* CB_POS */ - send 0x1a0 /* 256 + 160 */ + send 0x1e0 /* 256 + 224 */ braz $r2 #dei_end parm $r5 send $r4 /* start_instance, send index_bias */ send $r5 /* send start_instance */ @@ -311,7 +311,7 @@ dai_draw_again: braz $r3 #dai_end parm $r4 send $r4 /* start_instance */ maddr 0x18e3 /* CB_POS */ - send 0x1a0 /* 256 + 160 */ + send 0x1e0 /* 256 + 224 */ send 0x0 /* send 0 as base_vertex */ send $r4 /* send start_instance */ send $r6 /* draw id */ @@ -374,7 +374,7 @@ deic_draw_again: parm $r4 maddr 0x5f7 /* INDEX_BATCH_FIRST, start */ parm $r4 send $r4 /* index_bias, send start */ maddr 0x18e3 /* CB_POS */ - send 0x1a0 /* 256 + 160 */ + send 0x1e0 /* 256 + 224 */ braz $r2 #deic_end parm $r5 send $r4 /* start_instance, send index_bias */ send $r5 /* send start_instance */ @@ -455,7 +455,7 @@ daic_draw_again: braz $r3 #daic_end parm $r4 send $r4 /* start_instance */ maddr 0x18e3 /* CB_POS */ - send 0x1a0 /* 256 + 160 */ + send 0x1e0 /* 256 + 224 */ send 0x0 /* send 0 as base_vertex */ send $r4 /* send start_instance */ send $r6 /* draw id */ diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h index 49c0891114..47c5e6c6e0 100644 --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h @@ -140,7 +140,7 @@ uint32_t mme9097_draw_elts_indirect[] = { 0x017dc451, 0x00002431, 0x0638c021, - 
0x00680041, + 0x00780041, 0x0004d007, 0x00002531, 0x00002841, @@ -185,7 +185,7 @@ uint32_t mme9097_draw_arrays_indirect[] = { 0x0004d807, 0x00002431, 0x0638c021, - 0x00680041, + 0x00780041, 0x00000041, 0x00002041, 0x00003041, @@ -233,7 +233,7 @@ uint32_t mme9097_draw_elts_indirect_count[] = { 0x017dc451, 0x00002431, 0x0638c021, - 0x00680041, + 0x00780041, 0x0004d007, 0x00002531, 0x00002841, @@ -300,7 +300,7 @@ uint32_t mme9097_draw_arrays_indirect_count[] = { 0x0004d807, 0x00002431, 0x0638c021, - 0x00680041, + 0x00780041, 0x00000041, 0x00002041, 0x00003041, diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 77237a3c0a..1d920c26f5 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -122,35 +122,38 @@ /* 8 sets of 32-bits coordinate offsets */ #define NVC0_CB_AUX_MS_INFO 0x0c0 #define NVC0_CB_AUX_MS_SIZE (8 * 2 * 4) +/* 8 sets of 32-bit pairs containing coordinate adjustment information */ +#define NVC0_CB_AUX_MS_ADJ_INFO(i) 0x100 + (i) * 4 * 2 +#define NVC0_CB_AUX_MS_ADJ_SIZE (8 * 2 * 4) /* block/grid size, at 3 32-bits integers each, gridid and work_dim */ -#define NVC0_CB_AUX_GRID_INFO(i) 0x100 + (i) * 4 /* CP */ +#define NVC0_CB_AUX_GRID_INFO(i) 0x140 + (i) * 4 /* CP */ #define NVC0_CB_AUX_GRID_SIZE (8 * 4) /* FB texture handle */ -#define NVC0_CB_AUX_FB_TEX_INFO 0x100 /* FP */ +#define NVC0_CB_AUX_FB_TEX_INFO 0x140 /* FP */ #define NVC0_CB_AUX_FB_TEX_SIZE (4) /* 8 user clip planes, at 4 32-bits floats each */ -#define NVC0_CB_AUX_UCP_INFO 0x120 +#define NVC0_CB_AUX_UCP_INFO 0x160 #define NVC0_CB_AUX_UCP_SIZE (PIPE_MAX_CLIP_PLANES * 4 * 4) /* 13 ubos, at 4 32-bits integer each */ -#define NVC0_CB_AUX_UBO_INFO(i) 0x120 + (i) * 4 * 4 /* CP */ +#define NVC0_CB_AUX_UBO_INFO(i) 0x160 + (i) * 4 * 4 /* CP */ #define NVC0_CB_AUX_UBO_SIZE ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4) /* 8 sets of 32-bits integer pairs sample offsets */ -#define 
NVC0_CB_AUX_SAMPLE_INFO 0x1a0 /* FP */ +#define NVC0_CB_AUX_SAMPLE_INFO 0x1e0 /* FP */ /* 256 bytes, though only 64 bytes used before GM200 */ #define NVC0_CB_AUX_SAMPLE_SIZE (8 * 2 * 4 * 4) /* draw parameters (index bais, base instance, drawid) */ -#define NVC0_CB_AUX_DRAW_INFO 0x1a0 /* VP */ +#define NVC0_CB_AUX_DRAW_INFO 0x1e0 /* VP */ /* 32 user buffers, at 4 32-bits integers each */ -#define NVC0_CB_AUX_BUF_INFO(i) 0x2a0 + (i) * 4 * 4 +#define NVC0_CB_AUX_BUF_INFO(i) 0x2e0 + (i) * 4 * 4 #define NVC0_CB_AUX_BUF_SIZE (NVC0_MAX_BUFFERS * 4 * 4) /* 8 surfaces, at 16 32-bits integers each */ -#define NVC0_CB_AUX_SU_INFO(i) 0x4a0 + (i) * 16 * 4 +#define NVC0_CB_AUX_SU_INFO(i) 0x4e0 + (i) * 16 * 4 #define NVC0_CB_AUX_SU_SIZE (NVC0_MAX_IMAGES * 16 * 4) /* 1 64-bits address and 1 32-bits sequence */ -#define NVC0_CB_AUX_MP_INFO 0x6a0 +#define NVC0_CB_AUX_MP_INFO 0x6e0 #define NVC0_CB_AUX_MP_SIZE 3 * 4 /* 512 64-byte blocks for bindless image handles */ -#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6b0 + (i) * 16 * 4 +#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6f0 + (i) * 16 * 4 #define NVC0_CB_AUX_BINDLESS_SIZE (NVE4_IMG_MAX_HANDLES * 16 * 4) /* 4 32-bits floats for the vertex runout, put at the end */ #define NVC0_CB_AUX_RUNOUT_INFO NVC0_CB_USR_SIZE + (NVC0_CB_AUX_SIZE * 6) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index 57d98753f4..b3a0954d76 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -600,6 +600,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, info->io.ucpBase = NVC0_CB_AUX_UCP_INFO; info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO; info->io.msInfoBase = NVC0_CB_AUX_MS_INFO; + info->io.msAdjInfoBase = NVC0_CB_AUX_MS_ADJ_INFO(0); info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0); info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0); if (info->target >= NVISA_GK104_CHIPSET) { diff --git 
a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 2eecf59ce0..f67e42052e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -1362,6 +1362,21 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATA (push, 1); PUSH_DATA (push, 3); /* 7 */ PUSH_DATA (push, 1); + + /* MS coordinate adjustment information */ + for (int i = 1; i <= 8; i *= 2) { + BEGIN_1IC0(push, NVC0_3D(CB_POS), 3); + PUSH_DATA (push, NVC0_CB_AUX_MS_ADJ_INFO(i)); + int ms_x = 0, ms_y = 0; + switch (i) { + case 1: break; + case 2: ms_x = 1; break; + case 4: ms_x = 1; ms_y = 1; break; + case 8: ms_x = 2; ms_y = 1; break; + } + PUSH_DATA(push, ms_x); + PUSH_DATA(push, ms_y); + } } BEGIN_NVC0(push, NVC0_3D(LINKED_TSC), 1); PUSH_DATA (push, 0); diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index 8aa8d4936f..b7af7ab0d2 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -168,6 +168,28 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, PUSH_DATA (push, 3); /* 7 */ PUSH_DATA (push, 1); + /* MS coordinate adjustment information */ + for (int i = 1; i <= 8; i *= 2) { + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 4); + PUSH_DATA (push, 8); + PUSH_DATA (push, 1); + PUSH_DATAh(push, address + NVC0_CB_AUX_MS_ADJ_INFO(i)); + PUSH_DATA (push, address + NVC0_CB_AUX_MS_ADJ_INFO(i)); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 3); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + + int ms_x = 0, ms_y = 0; + switch (i) { + case 1: break; + case 2: ms_x = 1; break; + case 4: ms_x = 1; ms_y = 1; break; + case 8: ms_x = 2; ms_y = 1; break; + } + + PUSH_DATA(push, ms_x); + PUSH_DATA(push, ms_y); + } + #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->parm->offset + 
NVE4_CP_INPUT_TRAP_INFO_PTR); -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev