On Wed, Jul 29, 2015 at 1:20 AM, Dave Airlie <airl...@gmail.com> wrote: > From: Dave Airlie <airl...@redhat.com> > > This is the final piece for ARB_gpu_shader5, > > The code is based on the r600 code from Glenn Kennard, > and myself. > > While developing this, I'm not 100% sure of all the calculations > made in the GS registers, this is why the max_stream is worked > out there and used to limit the changes in registers. Otherwise > my initial attempts either regressed GS texelFetch tests > or primitive-id-restart. The current code has no regressions > in piglit. > > This commit doesn't enable ARB_gpu_shader5, since that just > bumps the glsl level to 4.00, so I'll just do a separate patch > for 4.10. > > v1.1: fix bug introduced in rebase. > > Signed-off-by: Dave Airlie <airl...@redhat.com> > --- > src/gallium/drivers/radeonsi/si_descriptors.c | 4 +- > src/gallium/drivers/radeonsi/si_pipe.c | 2 +- > src/gallium/drivers/radeonsi/si_shader.c | 59 ++++++++++++++++--- > src/gallium/drivers/radeonsi/si_state.c | 4 -- > src/gallium/drivers/radeonsi/si_state.h | 8 ++- > src/gallium/drivers/radeonsi/si_state_shaders.c | 75 > +++++++++++++++++++------ > 6 files changed, 120 insertions(+), 32 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c > b/src/gallium/drivers/radeonsi/si_descriptors.c > index 2e2a35b..14bb6e1 100644 > --- a/src/gallium/drivers/radeonsi/si_descriptors.c > +++ b/src/gallium/drivers/radeonsi/si_descriptors.c > @@ -724,7 +724,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint > shader, uint slot, > struct pipe_resource *buffer, > unsigned stride, unsigned num_records, > bool add_tid, bool swizzle, > - unsigned element_size, unsigned index_stride) > + unsigned element_size, unsigned index_stride, > uint64_t offset) > { > struct si_context *sctx = (struct si_context *)ctx; > struct si_buffer_resources *buffers = &sctx->rw_buffers[shader]; > @@ -741,7 +741,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint > 
shader, uint slot, > if (buffer) { > uint64_t va; > > - va = r600_resource(buffer)->gpu_address; > + va = r600_resource(buffer)->gpu_address + offset; > > switch (element_size) { > default: > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c > b/src/gallium/drivers/radeonsi/si_pipe.c > index 808b9bc..a120282 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.c > +++ b/src/gallium/drivers/radeonsi/si_pipe.c > @@ -316,7 +316,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum > pipe_cap param) > case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: > return 4095; > case PIPE_CAP_MAX_VERTEX_STREAMS: > - return 1; > + return 4; > > case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: > return 2048; > diff --git a/src/gallium/drivers/radeonsi/si_shader.c > b/src/gallium/drivers/radeonsi/si_shader.c > index fa31f73..b472fa6 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.c > +++ b/src/gallium/drivers/radeonsi/si_shader.c > @@ -31,6 +31,7 @@ > #include "gallivm/lp_bld_intr.h" > #include "gallivm/lp_bld_logic.h" > #include "gallivm/lp_bld_arit.h" > +#include "gallivm/lp_bld_bitarit.h" > #include "gallivm/lp_bld_flow.h" > #include "radeon/r600_cs.h" > #include "radeon/radeon_llvm.h" > @@ -1576,6 +1577,8 @@ static void si_llvm_emit_streamout(struct > si_shader_context *shader, > LLVMValueRef can_emit = > LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); > > + LLVMValueRef stream_id = > + unpack_param(shader, shader->param_streamout_config, 24, 2);
Wrong indentation and missing an empty line before the following comment. > /* Emit the streamout code conditionally. This actually avoids > * out-of-bounds buffer access. The hw tells us via the SGPR > * (so_vtx_count) which threads are allowed to emit streamout data. */ > @@ -1615,8 +1618,9 @@ static void si_llvm_emit_streamout(struct > si_shader_context *shader, > unsigned reg = so->output[i].register_index; > unsigned start = so->output[i].start_component; > unsigned num_comps = so->output[i].num_components; > + unsigned stream = so->output[i].stream; > LLVMValueRef out[4]; > - > + struct lp_build_if_state if_ctx_stream; There should be an empty line after the declaration. > assert(num_comps && num_comps <= 4); > if (!num_comps || num_comps > 4) > continue; > @@ -1649,11 +1653,15 @@ static void si_llvm_emit_streamout(struct > si_shader_context *shader, > break; > } > > + LLVMValueRef can_emit_stream = > + LLVMBuildICmp(builder, LLVMIntEQ, stream_id, > lp_build_const_int32(gallivm, stream), ""); Wrong indentation. 
> + lp_build_if(&if_ctx_stream, gallivm, can_emit_stream); > build_tbuffer_store_dwords(shader, > shader->so_buffers[buf_idx], > vdata, num_comps, > so_write_offset[buf_idx], > LLVMConstInt(i32, 0, 0), > > so->output[i].dst_offset*4); > + lp_build_endif(&if_ctx_stream); > } > } > lp_build_endif(&if_ctx); > @@ -3188,6 +3196,22 @@ static void build_interp_intrinsic(const struct > lp_build_tgsi_action *action, > } > } > > +static LLVMValueRef si_llvm_get_stream(struct lp_build_tgsi_context > *bld_base, > + struct lp_build_emit_data *emit_data) > +{ > + struct lp_build_context *uint = &bld_base->uint_bld; > + struct gallivm_state *gallivm = bld_base->base.gallivm; > + LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates; > + LLVMValueRef stream; > + struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; > + > + assert(src0.File == TGSI_FILE_IMMEDIATE); > + > + stream = imms[src0.Index][src0.SwizzleX]; > + stream = lp_build_and(uint, stream, lp_build_const_int32(gallivm, 3)); > + return stream; You can use LLVMConstIntGetZExtValue to evaluate the constant LLVMValueRef and return unsigned. You can use that result as the index into sctx->gsvs_ring, which can be an array of 4 descriptors. With that, you don't have to use build_indexed_load_const in si_llvm_emit_vertex. gs_next_vertex can also be an array of 4 LLVMValueRef variables, which will eliminate the need to allocate the array in the IR and use LLVMBuildInsert/ExtractElement functions. 
> +} > + > /* Emit one vertex from the geometry shader */ > static void si_llvm_emit_vertex( > const struct lp_build_tgsi_action *action, > @@ -3202,14 +3226,21 @@ static void si_llvm_emit_vertex( > LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); > LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, > SI_PARAM_GS2VS_OFFSET); > + LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, > + SI_PARAM_RW_BUFFERS); > + LLVMValueRef gs_next_vertex_array; > LLVMValueRef gs_next_vertex; > LLVMValueRef can_emit, kill; > LLVMValueRef args[2]; > unsigned chan; > int i; > + LLVMValueRef stream; > + LLVMValueRef gsvs_ring; > > + stream = si_llvm_get_stream(bld_base, emit_data); > /* Write vertex attribute values to GSVS ring */ > - gs_next_vertex = LLVMBuildLoad(gallivm->builder, > si_shader_ctx->gs_next_vertex, ""); > + gs_next_vertex_array = LLVMBuildLoad(gallivm->builder, > si_shader_ctx->gs_next_vertex, ""); > + gs_next_vertex = LLVMBuildExtractElement(gallivm->builder, > gs_next_vertex_array, stream, ""); > > /* If this thread has already emitted the declared maximum number of > * vertices, kill it: excessive vertex emissions are not supposed to > @@ -3225,6 +3256,9 @@ static void si_llvm_emit_vertex( > build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", > LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0); > > + gsvs_ring = build_indexed_load_const(si_shader_ctx, buf_ptr, > + lp_build_add(uint, stream, > lp_build_const_int32(gallivm, 1))); > + > for (i = 0; i < info->num_outputs; i++) { > LLVMValueRef *out_ptr = > si_shader_ctx->radeon_bld.soa.outputs[i]; > @@ -3241,7 +3275,7 @@ static void si_llvm_emit_vertex( > out_val = LLVMBuildBitCast(gallivm->builder, out_val, > i32, ""); > > build_tbuffer_store(si_shader_ctx, > - si_shader_ctx->gsvs_ring, > + gsvs_ring, > out_val, 1, > voffset, soffset, 0, > V_008F0C_BUF_DATA_FORMAT_32, > @@ -3251,10 +3285,16 @@ static void si_llvm_emit_vertex( > } > gs_next_vertex = 
lp_build_add(uint, gs_next_vertex, > lp_build_const_int32(gallivm, 1)); > - LLVMBuildStore(gallivm->builder, gs_next_vertex, > si_shader_ctx->gs_next_vertex); > + gs_next_vertex_array = LLVMBuildInsertElement(gallivm->builder, > gs_next_vertex_array, gs_next_vertex, > + stream, ""); > + LLVMBuildStore(gallivm->builder, gs_next_vertex_array, > si_shader_ctx->gs_next_vertex); > + > + /* shift stream value for or'ing */ > + stream = lp_build_shl_imm(uint, stream, 8); > > /* Signal vertex emission */ > args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | > SENDMSG_GS); > + args[0] = lp_build_or(uint, args[0], stream); > args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, > SI_PARAM_GS_WAVE_ID); > build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", > LLVMVoidTypeInContext(gallivm->context), args, 2, > @@ -3269,10 +3309,15 @@ static void si_llvm_emit_primitive( > { > struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); > struct gallivm_state *gallivm = bld_base->base.gallivm; > + struct lp_build_context *uint = &bld_base->uint_bld; > LLVMValueRef args[2]; > + LLVMValueRef stream; > > /* Signal primitive cut */ > + stream = si_llvm_get_stream(bld_base, emit_data); > + stream = lp_build_shl_imm(uint, stream, 8); > args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | > SENDMSG_GS); > + args[0] = lp_build_or(uint, args[0], stream); > args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, > SI_PARAM_GS_WAVE_ID); > build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", > LLVMVoidTypeInContext(gallivm->context), args, 2, > @@ -3651,8 +3696,7 @@ static void preload_ring_buffers(struct > si_shader_context *si_shader_ctx) > build_indexed_load_const(si_shader_ctx, buf_ptr, > offset); > } > > - if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY || > - si_shader_ctx->shader->is_gs_copy_shader) { > + if (si_shader_ctx->shader->is_gs_copy_shader) { > LLVMValueRef offset = lp_build_const_int32(gallivm, > SI_RING_GSVS); > > 
si_shader_ctx->gsvs_ring = > @@ -4076,9 +4120,10 @@ int si_shader_create(struct si_screen *sscreen, > LLVMTargetMachineRef tm, > preload_ring_buffers(&si_shader_ctx); > > if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { > + /* create a 4xuint32 */ > si_shader_ctx.gs_next_vertex = > lp_build_alloca(bld_base->base.gallivm, > - bld_base->uint_bld.elem_type, ""); > + > lp_build_int_vec_type(bld_base->base.gallivm, lp_type_uint_vec(32, 32*4)), > ""); > } > > if (!lp_build_tgsi_llvm(bld_base, tokens)) { > diff --git a/src/gallium/drivers/radeonsi/si_state.c > b/src/gallium/drivers/radeonsi/si_state.c > index ab5c3ca..86e1624 100644 > --- a/src/gallium/drivers/radeonsi/si_state.c > +++ b/src/gallium/drivers/radeonsi/si_state.c > @@ -3138,10 +3138,6 @@ static void si_init_config(struct si_context *sctx) > si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0); > si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); > > - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0); > - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0); > - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0); > - > si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); > si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0); > si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); > diff --git a/src/gallium/drivers/radeonsi/si_state.h > b/src/gallium/drivers/radeonsi/si_state.h > index 2522053..132ddda 100644 > --- a/src/gallium/drivers/radeonsi/si_state.h > +++ b/src/gallium/drivers/radeonsi/si_state.h > @@ -147,8 +147,12 @@ struct si_shader_data { > */ > #define SI_RING_TESS_FACTOR 0 /* for HS (TCS) */ > #define SI_RING_ESGS 0 /* for ES, GS */ > +#define SI_RING_ESGS 0 This definition is redundant. 
> #define SI_RING_GSVS 1 /* for GS, VS */ > -#define SI_NUM_RING_BUFFERS 2 > +#define SI_RING_GSVS_1 2 /* 1, 2, 3 for GS */ > +#define SI_RING_GSVS_2 3 > +#define SI_RING_GSVS_3 4 > +#define SI_NUM_RING_BUFFERS 5 > #define SI_SO_BUF_OFFSET SI_NUM_RING_BUFFERS > #define SI_NUM_RW_BUFFERS (SI_SO_BUF_OFFSET + 4) > > @@ -249,7 +253,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint > shader, uint slot, > struct pipe_resource *buffer, > unsigned stride, unsigned num_records, > bool add_tid, bool swizzle, > - unsigned element_size, unsigned index_stride); > + unsigned element_size, unsigned index_stride, > uint64_t offset); > void si_init_all_descriptors(struct si_context *sctx); > void si_release_all_descriptors(struct si_context *sctx); > void si_all_descriptors_begin_new_cs(struct si_context *sctx); > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c > b/src/gallium/drivers/radeonsi/si_state_shaders.c > index 18bddfd..daf41f3 100644 > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c > @@ -206,16 +206,32 @@ static void si_shader_es(struct si_shader *shader) > si_set_tesseval_regs(shader, pm4); > } > > +static unsigned si_gs_get_max_stream(struct si_shader *shader) > +{ > + struct pipe_stream_output_info *so = &shader->selector->so; > + unsigned max_stream, i; There should be an empty line after the declaration.
> + if (so->num_outputs == 0) > + return 0; > + > + max_stream = 0; > + for (i = 0; i < so->num_outputs; i++) { > + if (so->output[i].stream > max_stream) > + max_stream = so->output[i].stream; > + } > + return max_stream; > +} > + > static void si_shader_gs(struct si_shader *shader) > { > - unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 > >> 2); > + unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16; > unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; > - unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; > + unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2; > unsigned gs_num_invocations = shader->selector->gs_num_invocations; > unsigned cut_mode; > struct si_pm4_state *pm4; > unsigned num_sgprs, num_user_sgprs; > uint64_t va; > + unsigned max_stream = si_gs_get_max_stream(shader); > > /* The GSVS_RING_ITEMSIZE register takes 15 bits */ > assert(gsvs_itemsize < (1 << 15)); > @@ -243,16 +259,19 @@ static void si_shader_gs(struct si_shader *shader) > S_028A40_GS_WRITE_OPTIMIZE(1)); > > si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize); > - si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize); > - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize); > + si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * > ((max_stream >= 2) ? 2 : 1)); > + si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * > ((max_stream >= 3) ? 
3 : 1)); > > si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, > util_bitcount64(shader->selector->inputs_read) * (16 > >> 2)); > - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize); > + si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * > (max_stream + 1)); > > si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out); > > - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize); > + si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize > >> 2); > + si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= > 1) ? gs_vert_itemsize >> 2 : 0); > + si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= > 2) ? gs_vert_itemsize >> 2 : 0); > + si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= > 3) ? gs_vert_itemsize >> 2 : 0); > > si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, > S_028B90_CNT(MIN2(gs_num_invocations, 127)) | > @@ -1001,15 +1020,42 @@ static void si_init_gs_rings(struct si_context *sctx) > > si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, > sctx->esgs_ring, 0, esgs_ring_size, > - true, true, 4, 64); > + true, true, 4, 64, 0); > si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, > sctx->esgs_ring, 0, esgs_ring_size, > - false, false, 0, 0); > + false, false, 0, 0, 0); > si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, > sctx->gsvs_ring, 0, gsvs_ring_size, > - false, false, 0, 0); > + false, false, 0, 0, 0); > } > > +static void si_update_gs_rings(struct si_context *sctx) > +{ > + unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16; > + unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices; > + unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; > + uint64_t offset; > + > + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, > + sctx->gsvs_ring, gsvs_itemsize, > + 64, true, true, 4, 16, 0); > + > + offset = gsvs_itemsize * 64; > + 
si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1, > + sctx->gsvs_ring, gsvs_itemsize, > + 64, true, true, 4, 16, offset); > + > + offset = (gsvs_itemsize * 2) * 64; > + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2, > + sctx->gsvs_ring, gsvs_itemsize, > + 64, true, true, 4, 16, offset); > + > + offset = (gsvs_itemsize * 3) * 64; > + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3, > + sctx->gsvs_ring, gsvs_itemsize, > + 64, true, true, 4, 16, offset); > + > +} > /** > * @returns 1 if \p sel has been updated to use a new scratch buffer and 0 > * otherwise. > @@ -1171,7 +1217,7 @@ static void si_init_tess_factor_ring(struct si_context > *sctx) > > si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL, > SI_RING_TESS_FACTOR, sctx->tf_ring, 0, > - sctx->tf_ring->width0, false, false, 0, 0); > + sctx->tf_ring->width0, false, false, 0, 0, 0); > > sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; > } > @@ -1252,7 +1298,7 @@ static void si_update_so(struct si_context *sctx, > struct si_shader_selector *sha > int i; > > for (i = 0; i < so->num_outputs; i++) > - enabled_stream_buffers_mask |= (1 << > so->output[i].output_buffer); > + enabled_stream_buffers_mask |= (1 << > so->output[i].output_buffer) << so->output[i].stream * 4; Missing parentheses? (so->output[i].stream * 4) Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev