On Sat, Feb 27, 2016 at 9:02 AM, Samuel Pitoiset <samuel.pitoi...@gmail.com> wrote: > The launch descriptor only allows to set up 8 CBs, but OpenGL > requires at least 14 UBOs. To bypass this limitation, we store > the addrs into the driver constbuf and we directly load from > the global memory. > > Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com> > --- > .../drivers/nouveau/codegen/nv50_ir_driver.h | 1 + > .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 22 +++++++++++++++++++ > src/gallium/drivers/nouveau/nvc0/nvc0_context.h | 6 +++++- > src/gallium/drivers/nouveau/nvc0/nvc0_program.c | 1 + > src/gallium/drivers/nouveau/nvc0/nve4_compute.c | 25 > ++++++++++++++++++++++ > 5 files changed, 54 insertions(+), 1 deletion(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h > b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h > index 479e426..a66aa67 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h > @@ -183,6 +183,7 @@ struct nv50_ir_prog_info > uint16_t sampleInfoBase; /* base address for sample positions */ > uint8_t msInfoCBSlot; /* cX[] used for multisample info */ > uint16_t msInfoBase; /* base address for multisample info */ > + uint16_t uboInfoBase; /* base address for compute UBOs (gk104+) */ > } io; > > /* driver callback to assign input/output locations */ > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > index d6dfed3..2928963 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > @@ -1997,6 +1997,28 @@ NVC0LoweringPass::visit(Instruction *i) > i->setIndirect(0, 0, ptr); > i->subOp = NV50_IR_SUBOP_LDC_IS; > } > + > + if (targ->getChipset() >= NVISA_GK104_CHIPSET && > + prog->getType() == Program::TYPE_COMPUTE) { > + /* The launch descriptor only allows to set 
up 8 CBs, but OpenGL > + * requires at least 14 UBOs. To bypass this limitation, we store > + * the addrs into the driver constbuf and we directly load from > the > + * global memory. */ > + if (i->getSrc(0)->reg.fileIndex >= 7) { > + uint32_t addr = prog->driver->io.uboInfoBase; > + uint8_t b = prog->driver->io.resInfoCBSlot; > + > + addr += (i->getSrc(0)->reg.fileIndex % 7) * 0x8;
I think you wanted "- 7" rather than "% 7" here. > + > + Instruction *ld = bld.mkLoad(TYPE_U64, bld.getSSA(8, > FILE_GPR), > + bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, addr), NULL); > + > + bld.mkLoad(i->dType, i->getDef(0), > + bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0), > + ld->getDef(0)); > + bld.remove(i); So... let's say I make a UBO array with indirect block indexing... what do you do? (Hint: this won't work.) More interestingly, what does the blob do? Right now you're totally ignoring the indirect ptr for the UBOs with fileIndex >= 7. But even if you did it "properly", if I create a UBO block array that spans the "real cb" and "fake cb" boundary... not sure what to do. > + } > + } > } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) { > assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL); > i->op = OP_VFETCH; > diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h > b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h > index dcb0bda..06c1fc6 100644 > --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h > +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h > @@ -91,7 +91,8 @@ > #define NVC0_BIND_CP_SCREEN 51 > #define NVC0_BIND_CP_QUERY 52 > #define NVC0_BIND_CP_BUF 53 > -#define NVC0_BIND_CP_COUNT 54 > +#define NVC0_BIND_CP_UBO 54 > +#define NVC0_BIND_CP_COUNT 55 > > /* bufctx for other operations */ > #define NVC0_BIND_2D 0 > @@ -116,6 +117,9 @@ > /* 8 sets of 32-buts pairs MS offsets */ > #define NVC0_CB_AUX_MS_INFO 0x100 /* CP */ > #define NVC0_CB_AUX_MS_SIZE (8 * 2 * 4) > +/* 7 sets of 32-bits integer addrs */ > +#define NVC0_CB_AUX_UBO_INFO 0x140 /* CP */ > +#define NVC0_CB_AUX_UBO_SIZE (7 * 2 * 4) > /* 8 sets of 32-bits integer pairs sample offsets */ > #define NVC0_CB_AUX_SAMPLE_INFO 0x180 /* FP */ > #define NVC0_CB_AUX_SAMPLE_SIZE (8 * 4 * 2) > diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c > b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c > index afb909c..aba0eda 100644 > --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c > +++ 
b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c > @@ -544,6 +544,7 @@ nvc0_program_translate(struct nvc0_program *prog, > uint16_t chipset, > info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0); > info->io.suInfoBase = NVC0_CB_AUX_SUF_INFO(0); > info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO; > + info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO; > } else { > info->io.resInfoCBSlot = 15; > info->io.suInfoBase = 0; /* TODO */ > diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c > b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c > index 557dbdc..2640e0f 100644 > --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c > +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c > @@ -486,7 +486,9 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, > const uint *grid_layout) > { > const struct nvc0_screen *screen = nvc0->screen; > + struct nouveau_pushbuf *push = nvc0->base.pushbuf; > const struct nvc0_program *cp = nvc0->compprog; > + uint32_t address; > unsigned i; > > nve4_cp_launch_desc_init_default(desc); > @@ -521,6 +523,29 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, > } > nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo, > NVC0_CB_AUX_INFO(5), 1 << 10); > + > + address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5); > + > + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); > + PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO); > + PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO); > + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); > + PUSH_DATA (push, 7 * 2 * 4); I'd very much advise against hardcoding the 7 here. 
> + PUSH_DATA (push, 0x1); > + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7 * 2); > + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); > + > + for (; i < NVC0_MAX_PIPE_CONSTBUFS; i++) { > + struct nv04_resource *res = nv04_resource(nvc0->constbuf[5][i].u.buf); > + if (res) { > + PUSH_DATA (push, res->address + nvc0->constbuf[5][i].offset); > + PUSH_DATAh(push, res->address + nvc0->constbuf[5][i].offset); > + BCTX_REFN(nvc0->bufctx_cp, CP_UBO, res, RD); Did I miss the spot where you clear out this bufctx bin? > + } else { > + PUSH_DATA (push, 0); > + PUSH_DATA (push, 0); > + } > + } > } > > static inline struct nve4_cp_launch_desc * > -- > 2.7.1 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev