adds all the required logic for a20x hw binning to work Signed-off-by: Jonathan Marek <jonat...@marek.ca> --- src/gallium/drivers/freedreno/a2xx/fd2_draw.c | 95 ++++++++++++---- src/gallium/drivers/freedreno/a2xx/fd2_emit.c | 10 +- src/gallium/drivers/freedreno/a2xx/fd2_emit.h | 3 +- src/gallium/drivers/freedreno/a2xx/fd2_gmem.c | 106 +++++++++++++++++- .../drivers/freedreno/a2xx/fd2_program.c | 41 ++++--- .../drivers/freedreno/a2xx/fd2_program.h | 2 +- 6 files changed, 214 insertions(+), 43 deletions(-)
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 6f0535fa2b..1792505808 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -77,31 +77,56 @@ emit_vertexbufs(struct fd_context *ctx) // CONST(20,0) (or CONST(26,0) in soliv_vp) fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements); + fd2_emit_vertex_bufs(ctx->batch->binning, 0x78, bufs, vtx->num_elements); } -static bool -fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, - unsigned index_offset) +static void +draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, + struct fd_ringbuffer *ring, unsigned index_offset, + bool binning) { - struct fd_ringbuffer *ring = ctx->batch->draw; - - if (ctx->dirty & FD_DIRTY_VTXBUF) - emit_vertexbufs(ctx); - - fd2_emit_state(ctx, ctx->dirty); + enum pc_di_vis_cull_mode vismode; OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); - OUT_RING(ring, info->start); + OUT_RING(ring, info->index_size ? 0 : info->start); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000003b); + /* in the binning batch, this value is set once in fd2_emit_tile_init */ + if (!binning) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + /* XXX do this for every REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL write ? + * if set to 0x3b on a20x, clipping is broken + */ + OUT_RING(ring, is_a20x(ctx->screen) ? 0x00000002 : 0x0000003b); + } OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); - if (!is_a20x(ctx->screen)) { + if (is_a20x(ctx->screen)) { + /* wait for DMA to finish and + * dummy draw one triangle with indexes 0,0,0. + * with PRE_FETCH_CULL_ENABLE | GRP_CULL_ENABLE. 
+ * + * this workaround is for a HW bug related to DMA alignment: + * it is necessary for indexed draws and possibly also + * draws that read binning data + */ + OUT_PKT3(ring, CP_WAIT_REG_EQ, 4); + OUT_RING(ring, 0x000005d0); /* RBBM_STATUS */ + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00001000); /* bit: 12: VGT_BUSY_NO_DMA */ + OUT_RING(ring, 0x00000001); + + OUT_PKT3(ring, CP_DRAW_INDX_BIN, 6); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x0003c004); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000003); + OUT_RELOC(ring, fd_resource(fd2_context(ctx)->solid_vertexbuf)->bo, 0x80, 0, 0); + OUT_RING(ring, 0x00000006); + } else { OUT_WFI (ring); OUT_PKT3(ring, CP_SET_CONSTANT, 3); @@ -110,14 +135,44 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, OUT_RING(ring, info->min_index); /* VGT_MIN_VTX_INDX */ } + /* C64 holds offset to use for binning data */ + if (binning && is_a20x(ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000180); + OUT_RING(ring, fui(ctx->batch->num_vertices)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + vismode = binning ? IGNORE_VISIBILITY : USE_VISIBILITY; + /* a22x hw binning not implemented */ + if (binning || !is_a20x(ctx->screen) || (fd_mesa_debug & FD_DBG_NOBIN)) + vismode = IGNORE_VISIBILITY; + fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode], - IGNORE_VISIBILITY, info, index_offset); + vismode, info, index_offset); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010)); - OUT_RING(ring, 0x00000000); + /* necessary workaround.. 
gpu might hang without it */ + if (is_a20x(ctx->screen) && vismode == USE_VISIBILITY) + OUT_WFI(ring); emit_cacheflush(ring); +} + + +static bool +fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, + unsigned index_offset) +{ + if (ctx->dirty & FD_DIRTY_VTXBUF) + emit_vertexbufs(ctx); + + fd2_emit_state(ctx, ctx->batch->draw, ctx->dirty); + fd2_emit_state(ctx, ctx->batch->binning, ctx->dirty); + + draw_impl(ctx, pinfo, ctx->batch->draw, index_offset, false); + draw_impl(ctx, pinfo, ctx->batch->binning, index_offset, true); fd_context_all_clean(ctx); @@ -138,7 +193,7 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, colr = pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f); /* emit generic state now: */ - fd2_emit_state(ctx, ctx->dirty & + fd2_emit_state(ctx, ring, ctx->dirty & (FD_DIRTY_BLEND | FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR)); @@ -154,7 +209,7 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); OUT_RING(ring, 0x0000028f); - fd2_program_emit(ring, &ctx->solid_prog); + fd2_program_emit(ctx->batch, ring, &ctx->solid_prog); OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c index dc212ed413..dcb7b6500a 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c @@ -181,11 +181,12 @@ fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, } void -fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) +fd2_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + const enum fd_dirty_3d_state dirty) { struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend); struct fd2_zsa_stateobj *zsa = fd2_zsa_stateobj(ctx->zsa); - struct fd_ringbuffer *ring = ctx->batch->draw; + bool binning = ring == ctx->batch->binning; /* 
NOTE: we probably want to eventually refactor this so each state * object handles emitting it's own state.. although the mapping of @@ -223,7 +224,8 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); OUT_RING(ring, rasterizer->pa_cl_clip_cntl); OUT_RING(ring, rasterizer->pa_su_sc_mode_cntl | - A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE); + A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE | + (binning ? A2XX_PA_SU_SC_MODE_CNTL_FACE_KILL_ENABLE : 0)); OUT_PKT3(ring, CP_SET_CONSTANT, 5); OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_POINT_SIZE)); @@ -294,7 +296,7 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE)) { fd2_program_validate(ctx); - fd2_program_emit(ring, &ctx->prog); + fd2_program_emit(ctx->batch, ring, &ctx->prog); } if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h index d908b11351..8be4857ff7 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h @@ -42,7 +42,8 @@ struct fd2_vertex_buf { void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, struct fd2_vertex_buf *vbufs, uint32_t n); -void fd2_emit_state(struct fd_context *ctx, enum fd_dirty_3d_state dirty); +void fd2_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + const enum fd_dirty_3d_state dirty); void fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring); void fd2_emit_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index 62382995c0..09149345d8 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -126,7 +126,7 @@ fd2_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile 
*tile) OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); OUT_RING(ring, 0x0000028f); - fd2_program_emit(ring, &ctx->solid_prog); + fd2_program_emit(batch, ring, &ctx->solid_prog); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); @@ -267,7 +267,7 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); OUT_RING(ring, 0x0000003b); - fd2_program_emit(ring, &ctx->blit_prog[0]); + fd2_program_emit(batch, ring, &ctx->blit_prog[0]); OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); @@ -348,6 +348,7 @@ fd2_emit_tile_init(struct fd_batch *batch) struct fd_gmem_stateobj *gmem = &ctx->gmem; enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); uint32_t reg; + int i; fd2_emit_restore(ctx, ring); @@ -360,6 +361,90 @@ fd2_emit_tile_init(struct fd_batch *batch) if (pfb->zsbuf) reg |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); OUT_RING(ring, reg); /* RB_DEPTH_INFO */ + + if (is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN)) { + /* patch out unneeded memory exports by setting EXEC_END cf */ + for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); + *patch->cs = patch->val; + + instr_cf_t *cf = (instr_cf_t*) patch->cs; + if (cf->opc == ALLOC) + cf++; + assert(cf->opc == EXEC); + assert(cf[ctx->screen->num_vsc_pipes*2-2].opc == EXEC_END); + cf[2*(ctx->num_vsc_pipe-1)].opc = EXEC_END; + } + + /* initialize shader constants for the binning memexport */ + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + ctx->num_vsc_pipe * 4); + OUT_RING(ring, 0x0000000C); + + for (i = 0; i < ctx->num_vsc_pipe; i++) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + + /* XXX we know how large this needs to be.. 
+ * should do some sort of realloc + * should be ctx->batch->num_vertices bytes large + * with fixed 256kib it will break with more than 256k vertices + */ + if (!pipe->bo) { + pipe->bo = fd_bo_new(ctx->dev, 0x40000, + DRM_FREEDRENO_GEM_TYPE_KMEM); + } + + /* memory export address (export32): + * .x: (base_address >> 2) | 0x40000000 (?) + * .y: index (float) - set by shader + * .z: 0x4B00D000 (?) + * .w: 0x4B000000 (?) | max_index (?) + */ + OUT_RELOCW(ring, pipe->bo, 0, 0x40000000, -2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x4B00D000); + OUT_RING(ring, 0x4B000000 | 0x40000); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + ctx->num_vsc_pipe * 8); + OUT_RING(ring, 0x0000018C); + + for (i = 0; i < ctx->num_vsc_pipe; i++) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + float off_x, off_y, mul_x, mul_y; + + /* const to transform from [-1,1] to bin coordinates for this pipe + * for x/y, [0,256/2040] = 0, [256/2040,512/2040] = 1, etc + * 8 possible values on x/y axis, + * to clip at binning stage: only use center 6x6 + * TODO: set the z parameters too so that hw binning + * can clip primitives in Z too + * TODO: how does pointsize fit into this? 
+ */ + + mul_x = 1.0f / (float) (gmem->bin_w * 8); + mul_y = 1.0f / (float) (gmem->bin_h * 8); + off_x = -pipe->x * (1.0/8.0f) + 0.125f; + off_y = -pipe->y * (1.0/8.0f) + 0.125f; + + OUT_RING(ring, fui(off_x * (256.0f/255.0f))); + OUT_RING(ring, fui(off_y * (256.0f/255.0f))); + OUT_RING(ring, 0x3f000000); + OUT_RING(ring, fui(0.0f)); + + OUT_RING(ring, fui(mul_x * (256.0f/255.0f))); + OUT_RING(ring, fui(mul_y * (256.0f/255.0f))); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0); + + ctx->emit_ib(ring, batch->binning); + } + + util_dynarray_resize(&batch->draw_patches, 0); } /* before mem2gmem */ @@ -388,6 +473,7 @@ fd2_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) static void fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) { + struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); @@ -404,6 +490,22 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) | A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff)); + + if (is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN)) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); + OUT_RING(ring, tile->n); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); + OUT_RING(ring, tile->n); + + /* TODO only emit this when tile->p changes */ + OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1); + OUT_RELOC(ring, pipe->bo, 0, 0, 0); + } } void diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c index 
74b3da5895..4473ea91d5 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c @@ -65,10 +65,10 @@ delete_shader(struct fd2_shader_stateobj *so) } static struct fd2_shader_stateobj * -assemble(struct fd2_shader_stateobj *so) +assemble(struct fd2_shader_stateobj *so, bool a20x_binning) { free(so->bin); - so->bin = ir2_shader_assemble(so->ir, &so->info, 0); + so->bin = ir2_shader_assemble(so->ir, &so->info, a20x_binning); if (!so->bin) goto fail; @@ -117,18 +117,25 @@ fail: } static void -emit(struct fd_ringbuffer *ring, struct fd2_shader_stateobj *so) +emit(struct fd_batch *batch, struct fd_ringbuffer *ring, + struct fd2_shader_stateobj *so) { + bool binning = ring == batch->binning; + bool a20x_binning = binning && is_a20x(batch->ctx->screen); unsigned i; if (so->info.sizedwords == 0) - assemble(so); + assemble(so, a20x_binning); OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + so->info.sizedwords); OUT_RING(ring, (so->type == SHADER_VERTEX) ? 
0 : 1); OUT_RING(ring, so->info.sizedwords); - for (i = 0; i < so->info.sizedwords; i++) - OUT_RING(ring, so->bin[i]); + for (i = 0; i < so->info.sizedwords; i++) { + if (a20x_binning && i == so->info.cf_export32) + OUT_RINGP(ring, so->bin[i], &batch->draw_patches); + else + OUT_RING(ring, so->bin[i]); + } } static void * @@ -273,7 +280,7 @@ fd2_program_validate(struct fd_context *ctx) } void -fd2_program_emit(struct fd_ringbuffer *ring, +fd2_program_emit(struct fd_batch *batch, struct fd_ringbuffer *ring, struct fd_program_stateobj *prog) { struct ir2_shader_info *vsi = @@ -281,9 +288,12 @@ fd2_program_emit(struct fd_ringbuffer *ring, struct ir2_shader_info *fsi = &((struct fd2_shader_stateobj *)prog->fp)->info; uint8_t vs_gprs, fs_gprs, vs_export; + bool binning = ring == batch->binning; + bool a20x_binning = binning && is_a20x(batch->ctx->screen); - emit(ring, prog->vp); - emit(ring, prog->fp); + emit(batch, ring, prog->vp); + if (!binning) + emit(batch, ring, prog->fp); vs_gprs = (vsi->max_reg < 0) ? 0x80 : vsi->max_reg; fs_gprs = (fsi->max_reg < 0) ? 0x80 : fsi->max_reg; @@ -292,11 +302,13 @@ fd2_program_emit(struct fd_ringbuffer *ring, OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_SQ_PROGRAM_CNTL)); OUT_RING(ring, A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(POSITION_2_VECTORS_SPRITE) | + A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_MODE(0) | A2XX_SQ_PROGRAM_CNTL_VS_RESOURCE | A2XX_SQ_PROGRAM_CNTL_PS_RESOURCE | A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_COUNT(vs_export) | A2XX_SQ_PROGRAM_CNTL_PS_REGS(fs_gprs) | - A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs)); + A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs) | + (a20x_binning ? 
A2XX_SQ_PROGRAM_CNTL_GEN_INDEX_VTX : 0)); } /* Creates shader: @@ -328,7 +340,7 @@ create_blit_fp(void) ir2_reg_create(instr, 0, NULL, 0); ir2_reg_create(instr, 0, NULL, 0); - return assemble(so); + return assemble(so, false); } /* Creates shader: @@ -374,7 +386,7 @@ create_blit_vp(void) ir2_reg_create(instr, 1, NULL, 0); ir2_reg_create(instr, 1, NULL, 0); - return assemble(so); + return assemble(so, false); } /* Creates shader: @@ -398,7 +410,7 @@ create_solid_fp(void) ir2_reg_create(instr, 0, NULL, IR2_REG_CONST); ir2_reg_create(instr, 0, NULL, IR2_REG_CONST); - return assemble(so); + return assemble(so, false); } /* Creates shader: @@ -431,8 +443,7 @@ create_solid_vp(void) ir2_reg_create(instr, 1, NULL, 0); ir2_reg_create(instr, 1, NULL, 0); - - return assemble(so); + return assemble(so, false); } void diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.h b/src/gallium/drivers/freedreno/a2xx/fd2_program.h index 170b22abee..7d194f49d0 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.h @@ -72,7 +72,7 @@ struct fd2_shader_stateobj { } immediates[64]; }; -void fd2_program_emit(struct fd_ringbuffer *ring, +void fd2_program_emit(struct fd_batch *batch, struct fd_ringbuffer *ring, struct fd_program_stateobj *prog); void fd2_program_validate(struct fd_context *ctx); -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev