On Fri, Apr 28, 2017 at 9:33 PM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > On 28.04.2017 17:59, Marek Olšák wrote: >> >> On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle <nhaeh...@gmail.com> >> wrote: >>> >>> On 24.04.2017 10:45, Marek Olšák wrote: >>>> >>>> >>>> From: Marek Olšák <marek.ol...@amd.com> >>>> >>>> --- >>>> src/gallium/drivers/radeonsi/si_shader.c | 87 >>>> +++++++++++++++++++++++++------- >>>> 1 file changed, 70 insertions(+), 17 deletions(-) >>>> >>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c >>>> b/src/gallium/drivers/radeonsi/si_shader.c >>>> index a4c2ac0..392f85d 100644 >>>> --- a/src/gallium/drivers/radeonsi/si_shader.c >>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c >>>> @@ -7368,20 +7368,28 @@ static void >>>> si_count_scratch_private_memory(struct >>>> si_shader_context *ctx) >>>> LLVMTypeRef type = >>>> LLVMGetElementType(LLVMTypeOf(inst)); >>>> /* No idea why LLVM aligns allocas to 4 >>>> elements. >>>> */ >>>> unsigned alignment = LLVMGetAlignment(inst); >>>> unsigned dw_size = >>>> align(llvm_get_type_size(type) >>>> / 4, alignment); >>>> ctx->shader->config.private_mem_vgprs += >>>> dw_size; >>>> } >>>> bb = LLVMGetNextBasicBlock(bb); >>>> } >>>> } >>>> >>>> +static void si_init_exec_full_mask(struct si_shader_context *ctx) >>>> +{ >>>> + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); >>>> + lp_build_intrinsic(ctx->gallivm.builder, >>>> + "llvm.amdgcn.init.exec", ctx->voidt, >>>> + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); >>>> +} >>>> + >>>> static void si_init_exec_from_input(struct si_shader_context *ctx, >>>> unsigned param, unsigned bitoffset) >>>> { >>>> LLVMValueRef args[] = { >>>> LLVMGetParam(ctx->main_fn, param), >>>> LLVMConstInt(ctx->i32, bitoffset, 0), >>>> }; >>>> lp_build_intrinsic(ctx->gallivm.builder, >>>> "llvm.amdgcn.init.exec.from.input", >>>> ctx->voidt, args, 2, >>>> LP_FUNC_ATTR_CONVERGENT); >>>> @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct >>>> si_shader >>>> 
*shader, >>>> key->ps_epilog.states = shader->key.part.ps.epilog; >>>> } >>>> >>>> /** >>>> * Build the GS prolog function. Rotate the input vertices for triangle >>>> strips >>>> * with adjacency. >>>> */ >>>> static void si_build_gs_prolog_function(struct si_shader_context *ctx, >>>> union si_shader_part_key *key) >>>> { >>>> - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; >>>> - const unsigned num_vgprs = 8; >>>> + unsigned num_sgprs, num_vgprs; >>>> struct gallivm_state *gallivm = &ctx->gallivm; >>>> LLVMBuilderRef builder = gallivm->builder; >>>> - LLVMTypeRef params[32]; >>>> - LLVMTypeRef returns[32]; >>>> + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */ >>>> + LLVMTypeRef returns[48]; >>>> LLVMValueRef func, ret; >>>> >>>> + if (ctx->screen->b.chip_class >= GFX9) { >>>> + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; >>>> + num_vgprs = 5; /* ES inputs are not needed by GS */ >>>> + } else { >>>> + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; >>>> + num_vgprs = 8; >>>> + } >>>> + >>>> for (unsigned i = 0; i < num_sgprs; ++i) { >>>> params[i] = ctx->i32; >>>> returns[i] = ctx->i32; >>>> } >>>> >>>> for (unsigned i = 0; i < num_vgprs; ++i) { >>>> params[num_sgprs + i] = ctx->i32; >>>> returns[num_sgprs + i] = ctx->f32; >>>> } >>>> >>>> /* Create the function. */ >>>> si_create_function(ctx, "gs_prolog", returns, num_sgprs + >>>> num_vgprs, >>>> params, num_sgprs + num_vgprs, num_sgprs - >>>> 1); >>>> func = ctx->main_fn; >>>> >>>> + /* Set the full EXEC mask for the prolog, because we are only >>>> fiddling >>>> + * with registers here. The main shader part will set the >>>> correct >>>> EXEC >>>> + * mask. >>>> + */ >>>> + if (ctx->screen->b.chip_class >= GFX9) >>>> + si_init_exec_full_mask(ctx); >>>> + >>>> /* Copy inputs to outputs. This should be no-op, as the >>>> registers >>>> match, >>>> * but it will prevent the compiler from overwriting them >>>> unintentionally. 
>>>> */ >>>> ret = ctx->return_value; >>>> for (unsigned i = 0; i < num_sgprs; i++) { >>>> LLVMValueRef p = LLVMGetParam(func, i); >>>> ret = LLVMBuildInsertValue(builder, ret, p, i, ""); >>>> } >>>> for (unsigned i = 0; i < num_vgprs; i++) { >>>> LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); >>>> p = LLVMBuildBitCast(builder, p, ctx->f32, ""); >>>> ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + >>>> i, >>>> ""); >>>> } >>>> >>>> if (key->gs_prolog.states.tri_strip_adj_fix) { >>>> /* Remap the input vertices for every other primitive. >>>> */ >>>> - const unsigned vtx_params[6] = { >>>> + const unsigned gfx6_vtx_params[6] = { >>>> num_sgprs, >>>> num_sgprs + 1, >>>> num_sgprs + 3, >>>> num_sgprs + 4, >>>> num_sgprs + 5, >>>> num_sgprs + 6 >>>> }; >>>> + const unsigned gfx9_vtx_params[3] = { >>>> + num_sgprs, >>>> + num_sgprs + 1, >>>> + num_sgprs + 4, >>>> + }; >>>> + LLVMValueRef vtx_in[6], vtx_out[6]; >>>> LLVMValueRef prim_id, rotate; >>>> >>>> + if (ctx->screen->b.chip_class >= GFX9) { >>>> + for (unsigned i = 0; i < 3; i++) { >>>> + vtx_in[i*2] = unpack_param(ctx, >>>> gfx9_vtx_params[i], 0, 16); >>>> + vtx_in[i*2+1] = unpack_param(ctx, >>>> gfx9_vtx_params[i], 16, 16); >>>> + } >>>> + } else { >>>> + for (unsigned i = 0; i < 6; i++) >>>> + vtx_in[i] = LLVMGetParam(func, >>>> gfx6_vtx_params[i]); >>>> + } >>>> + >>>> prim_id = LLVMGetParam(func, num_sgprs + 2); >>>> rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); >>>> >>>> for (unsigned i = 0; i < 6; ++i) { >>>> - LLVMValueRef base, rotated, actual; >>>> - base = LLVMGetParam(func, vtx_params[i]); >>>> - rotated = LLVMGetParam(func, vtx_params[(i + 4) >>>> % >>>> 6]); >>>> - actual = LLVMBuildSelect(builder, rotate, >>>> rotated, >>>> base, ""); >>>> - actual = LLVMBuildBitCast(builder, actual, >>>> ctx->f32, ""); >>>> - ret = LLVMBuildInsertValue(builder, ret, actual, >>>> vtx_params[i], ""); >>>> + LLVMValueRef base, rotated; >>>> + base = vtx_in[i]; >>>> + rotated = vtx_in[(i + 4) % 
6]; >>>> + vtx_out[i] = LLVMBuildSelect(builder, rotate, >>>> rotated, base, ""); >>>> + } >>>> + >>>> + if (ctx->screen->b.chip_class >= GFX9) { >>>> + for (unsigned i = 0; i < 3; i++) { >>>> + LLVMValueRef hi, out; >>>> + >>>> + hi = LLVMBuildShl(builder, >>>> vtx_out[i*2+1], >>>> + LLVMConstInt(ctx->i32, >>>> 16, 0), ""); >>>> + out = LLVMBuildOr(builder, vtx_out[i*2], >>>> hi, ""); >>>> + out = LLVMBuildBitCast(builder, out, >>>> ctx->f32, ""); >>>> + ret = LLVMBuildInsertValue(builder, ret, >>>> out, >>>> + >>>> gfx9_vtx_params[i], ""); >>>> + } >>>> + } else { >>>> + for (unsigned i = 0; i < 6; i++) { >>>> + LLVMValueRef out; >>>> + >>>> + out = LLVMBuildBitCast(builder, >>>> vtx_out[i], ctx->f32, ""); >>>> + ret = LLVMBuildInsertValue(builder, ret, >>>> out, >>>> + >>>> gfx6_vtx_params[i], ""); >>>> + } >>>> } >>> >>> >>> >>> I believe this could be simplified quite a bit, since the vertex indices >>> are >>> rotated by a multiple of 2. So there's no need to unpack the bits and pack >>> them again, instead just rotate the 3 input registers by 2 instead of >>> rotating 6 input registers by 4. >>> >>> I'm fine with it if you want to do that in a follow-up patch. >> >> >> To be honest with you, I'm really not into optimizing for a GS >> workaround while hardly any app uses GS. > > > I'm not really worrying about performance here, either. It's about the size > and complexity of the code.
OK gotcha. Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev