From: Dave Airlie <airl...@redhat.com> This is the final piece for ARB_gpu_shader5.
The code is based on the r600 code from Glenn Kennard and myself. While developing this, I was not 100% sure of all the calculations made for the GS registers; this is why max_stream is worked out there and used to limit the changes in registers. Otherwise my initial attempts either regressed GS texelFetch tests or primitive-id-restart. The current code has no regressions in piglit. This commit doesn't enable ARB_gpu_shader5, since that just bumps the GLSL level to 4.00, so I'll just do a separate patch for 4.10. v1.1: fix bug introduced in rebase. Signed-off-by: Dave Airlie <airl...@redhat.com> --- src/gallium/drivers/radeonsi/si_descriptors.c | 4 +- src/gallium/drivers/radeonsi/si_pipe.c | 2 +- src/gallium/drivers/radeonsi/si_shader.c | 59 ++++++++++++++++--- src/gallium/drivers/radeonsi/si_state.c | 4 -- src/gallium/drivers/radeonsi/si_state.h | 8 ++- src/gallium/drivers/radeonsi/si_state_shaders.c | 75 +++++++++++++++++++------ 6 files changed, 120 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 2e2a35b..14bb6e1 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -724,7 +724,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, struct pipe_resource *buffer, unsigned stride, unsigned num_records, bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride) + unsigned element_size, unsigned index_stride, uint64_t offset) { struct si_context *sctx = (struct si_context *)ctx; struct si_buffer_resources *buffers = &sctx->rw_buffers[shader]; @@ -741,7 +741,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, if (buffer) { uint64_t va; - va = r600_resource(buffer)->gpu_address; + va = r600_resource(buffer)->gpu_address + offset; switch (element_size) { default: diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c index 808b9bc..a120282 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -316,7 +316,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: return 4095; case PIPE_CAP_MAX_VERTEX_STREAMS: - return 1; + return 4; case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: return 2048; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index fa31f73..b472fa6 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -31,6 +31,7 @@ #include "gallivm/lp_bld_intr.h" #include "gallivm/lp_bld_logic.h" #include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_bitarit.h" #include "gallivm/lp_bld_flow.h" #include "radeon/r600_cs.h" #include "radeon/radeon_llvm.h" @@ -1576,6 +1577,8 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); + LLVMValueRef stream_id = + unpack_param(shader, shader->param_streamout_config, 24, 2); /* Emit the streamout code conditionally. This actually avoids * out-of-bounds buffer access. The hw tells us via the SGPR * (so_vtx_count) which threads are allowed to emit streamout data. 
*/ @@ -1615,8 +1618,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, unsigned reg = so->output[i].register_index; unsigned start = so->output[i].start_component; unsigned num_comps = so->output[i].num_components; + unsigned stream = so->output[i].stream; LLVMValueRef out[4]; - + struct lp_build_if_state if_ctx_stream; assert(num_comps && num_comps <= 4); if (!num_comps || num_comps > 4) continue; @@ -1649,11 +1653,15 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, break; } + LLVMValueRef can_emit_stream = + LLVMBuildICmp(builder, LLVMIntEQ, stream_id, lp_build_const_int32(gallivm, stream), ""); + lp_build_if(&if_ctx_stream, gallivm, can_emit_stream); build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx], vdata, num_comps, so_write_offset[buf_idx], LLVMConstInt(i32, 0, 0), so->output[i].dst_offset*4); + lp_build_endif(&if_ctx_stream); } } lp_build_endif(&if_ctx); @@ -3188,6 +3196,22 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, } } +static LLVMValueRef si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct lp_build_context *uint = &bld_base->uint_bld; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates; + LLVMValueRef stream; + struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; + + assert(src0.File == TGSI_FILE_IMMEDIATE); + + stream = imms[src0.Index][src0.SwizzleX]; + stream = lp_build_and(uint, stream, lp_build_const_int32(gallivm, 3)); + return stream; +} + /* Emit one vertex from the geometry shader */ static void si_llvm_emit_vertex( const struct lp_build_tgsi_action *action, @@ -3202,14 +3226,21 @@ static void si_llvm_emit_vertex( LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS2VS_OFFSET); + LLVMValueRef buf_ptr = 
LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_RW_BUFFERS); + LLVMValueRef gs_next_vertex_array; LLVMValueRef gs_next_vertex; LLVMValueRef can_emit, kill; LLVMValueRef args[2]; unsigned chan; int i; + LLVMValueRef stream; + LLVMValueRef gsvs_ring; + stream = si_llvm_get_stream(bld_base, emit_data); /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, ""); + gs_next_vertex_array = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, ""); + gs_next_vertex = LLVMBuildExtractElement(gallivm->builder, gs_next_vertex_array, stream, ""); /* If this thread has already emitted the declared maximum number of * vertices, kill it: excessive vertex emissions are not supposed to @@ -3225,6 +3256,9 @@ static void si_llvm_emit_vertex( build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0); + gsvs_ring = build_indexed_load_const(si_shader_ctx, buf_ptr, + lp_build_add(uint, stream, lp_build_const_int32(gallivm, 1))); + for (i = 0; i < info->num_outputs; i++) { LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i]; @@ -3241,7 +3275,7 @@ static void si_llvm_emit_vertex( out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, ""); build_tbuffer_store(si_shader_ctx, - si_shader_ctx->gsvs_ring, + gsvs_ring, out_val, 1, voffset, soffset, 0, V_008F0C_BUF_DATA_FORMAT_32, @@ -3251,10 +3285,16 @@ static void si_llvm_emit_vertex( } gs_next_vertex = lp_build_add(uint, gs_next_vertex, lp_build_const_int32(gallivm, 1)); - LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex); + gs_next_vertex_array = LLVMBuildInsertElement(gallivm->builder, gs_next_vertex_array, gs_next_vertex, + stream, ""); + LLVMBuildStore(gallivm->builder, gs_next_vertex_array, si_shader_ctx->gs_next_vertex); + + /* shift stream value for or'ing */ + stream = lp_build_shl_imm(uint, stream, 8); /* Signal vertex emission */ args[0] = 
lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS); + args[0] = lp_build_or(uint, args[0], stream); args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", LLVMVoidTypeInContext(gallivm->context), args, 2, @@ -3269,10 +3309,15 @@ static void si_llvm_emit_primitive( { struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_context *uint = &bld_base->uint_bld; LLVMValueRef args[2]; + LLVMValueRef stream; /* Signal primitive cut */ + stream = si_llvm_get_stream(bld_base, emit_data); + stream = lp_build_shl_imm(uint, stream, 8); args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS); + args[0] = lp_build_or(uint, args[0], stream); args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", LLVMVoidTypeInContext(gallivm->context), args, 2, @@ -3651,8 +3696,7 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) build_indexed_load_const(si_shader_ctx, buf_ptr, offset); } - if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY || - si_shader_ctx->shader->is_gs_copy_shader) { + if (si_shader_ctx->shader->is_gs_copy_shader) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); si_shader_ctx->gsvs_ring = @@ -4076,9 +4120,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, preload_ring_buffers(&si_shader_ctx); if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { + /* create a 4xuint32 */ si_shader_ctx.gs_next_vertex = lp_build_alloca(bld_base->base.gallivm, - bld_base->uint_bld.elem_type, ""); + lp_build_int_vec_type(bld_base->base.gallivm, lp_type_uint_vec(32, 32*4)), ""); } if (!lp_build_tgsi_llvm(bld_base, tokens)) { diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index ab5c3ca..86e1624 100644 --- 
a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -3138,10 +3138,6 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0); si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0); - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0); - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0); - si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0); si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 2522053..132ddda 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -147,8 +147,12 @@ struct si_shader_data { */ #define SI_RING_TESS_FACTOR 0 /* for HS (TCS) */ #define SI_RING_ESGS 0 /* for ES, GS */ +#define SI_RING_ESGS 0 #define SI_RING_GSVS 1 /* for GS, VS */ -#define SI_NUM_RING_BUFFERS 2 +#define SI_RING_GSVS_1 2 /* 1, 2, 3 for GS */ +#define SI_RING_GSVS_2 3 +#define SI_RING_GSVS_3 4 +#define SI_NUM_RING_BUFFERS 5 #define SI_SO_BUF_OFFSET SI_NUM_RING_BUFFERS #define SI_NUM_RW_BUFFERS (SI_SO_BUF_OFFSET + 4) @@ -249,7 +253,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, struct pipe_resource *buffer, unsigned stride, unsigned num_records, bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride); + unsigned element_size, unsigned index_stride, uint64_t offset); void si_init_all_descriptors(struct si_context *sctx); void si_release_all_descriptors(struct si_context *sctx); void si_all_descriptors_begin_new_cs(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 18bddfd..daf41f3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ 
b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -206,16 +206,32 @@ static void si_shader_es(struct si_shader *shader) si_set_tesseval_regs(shader, pm4); } +static unsigned si_gs_get_max_stream(struct si_shader *shader) +{ + struct pipe_stream_output_info *so = &shader->selector->so; + unsigned max_stream, i; + if (so->num_outputs == 0) + return 0; + + max_stream = 0; + for (i = 0; i < so->num_outputs; i++) { + if (so->output[i].stream > max_stream) + max_stream = so->output[i].stream; + } + return max_stream; +} + static void si_shader_gs(struct si_shader *shader) { - unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 >> 2); + unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16; unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; - unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; + unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2; unsigned gs_num_invocations = shader->selector->gs_num_invocations; unsigned cut_mode; struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; uint64_t va; + unsigned max_stream = si_gs_get_max_stream(shader); /* The GSVS_RING_ITEMSIZE register takes 15 bits */ assert(gsvs_itemsize < (1 << 15)); @@ -243,16 +259,19 @@ static void si_shader_gs(struct si_shader *shader) S_028A40_GS_WRITE_OPTIMIZE(1)); si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize); - si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize); - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize); + si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1)); + si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 
3 : 1)); si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, util_bitcount64(shader->selector->inputs_read) * (16 >> 2)); - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize); + si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1)); si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out); - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize); + si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2); + si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0); + si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0); + si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0); si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, S_028B90_CNT(MIN2(gs_num_invocations, 127)) | @@ -1001,15 +1020,42 @@ static void si_init_gs_rings(struct si_context *sctx) si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, sctx->esgs_ring, 0, esgs_ring_size, - true, true, 4, 64); + true, true, 4, 64, 0); si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, sctx->esgs_ring, 0, esgs_ring_size, - false, false, 0, 0); + false, false, 0, 0, 0); si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, sctx->gsvs_ring, 0, gsvs_ring_size, - false, false, 0, 0); + false, false, 0, 0, 0); } +static void si_update_gs_rings(struct si_context *sctx) +{ + unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16; + unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices; + unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; + uint64_t offset; + + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, 0); + + offset = gsvs_itemsize * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, 
true, 4, 16, offset); + + offset = (gsvs_itemsize * 2) * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, offset); + + offset = (gsvs_itemsize * 3) * 64; + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3, + sctx->gsvs_ring, gsvs_itemsize, + 64, true, true, 4, 16, offset); + +} /** * @returns 1 if \p sel has been updated to use a new scratch buffer and 0 * otherwise. @@ -1171,7 +1217,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx) si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL, SI_RING_TESS_FACTOR, sctx->tf_ring, 0, - sctx->tf_ring->width0, false, false, 0, 0); + sctx->tf_ring->width0, false, false, 0, 0, 0); sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; } @@ -1252,7 +1298,7 @@ static void si_update_so(struct si_context *sctx, struct si_shader_selector *sha int i; for (i = 0; i < so->num_outputs; i++) - enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer); + enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4; sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask; sctx->b.streamout.stride_in_dw = shader->so.stride; } @@ -1311,15 +1357,12 @@ void si_update_shaders(struct si_context *sctx) if (!sctx->gs_rings) si_init_gs_rings(sctx); + if (sctx->emitted.named.gs_rings != sctx->gs_rings) sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; si_pm4_bind_state(sctx, gs_rings, sctx->gs_rings); - si_set_ring_buffer(ctx, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, - sctx->gsvs_ring, - sctx->gs_shader->gs_max_out_vertices * - sctx->gs_shader->info.num_outputs * 16, - 64, true, true, 4, 16); + si_update_gs_rings(sctx); } else { si_pm4_bind_state(sctx, gs_rings, NULL); si_pm4_bind_state(sctx, gs, NULL); -- 2.4.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev