Hi Bas,

Have you run the piglit test suite on radeonsi with this patch applied?
Marek On Sat, Jun 10, 2017 at 10:05 PM, Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl> wrote: > Slightly faster than bpermute, and seems supported since at least > LLVM 3.9. > > v2: Since this supersedes bpermute, remove the bpermute code. > Signed-off-by: Bas Nieuwenhuizen <ba...@google.com> > --- > src/amd/common/ac_llvm_build.c | 47 > ++++++++++++++++++++------------ > src/amd/common/ac_llvm_build.h | 2 +- > src/amd/common/ac_nir_to_llvm.c | 8 +++--- > src/gallium/drivers/radeonsi/si_pipe.c | 2 +- > src/gallium/drivers/radeonsi/si_pipe.h | 2 +- > src/gallium/drivers/radeonsi/si_shader.c | 4 +-- > 6 files changed, 38 insertions(+), 27 deletions(-) > > diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c > index 237e9291d41..99d41bf52d6 100644 > --- a/src/amd/common/ac_llvm_build.c > +++ b/src/amd/common/ac_llvm_build.c > @@ -783,41 +783,52 @@ ac_get_thread_id(struct ac_llvm_context *ctx) > */ > LLVMValueRef > ac_build_ddxy(struct ac_llvm_context *ctx, > - bool has_ds_bpermute, > + bool has_mov_dpp, > uint32_t mask, > int idx, > LLVMValueRef lds, > LLVMValueRef val) > { > - LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2]; > + LLVMValueRef thread_id, tl, trbl, args[5]; > LLVMValueRef result; > > - thread_id = ac_get_thread_id(ctx); > + if (has_mov_dpp) { > + uint32_t tl_ctrl = 0, trbl_ctrl = 0; > > - tl_tid = LLVMBuildAnd(ctx->builder, thread_id, > - LLVMConstInt(ctx->i32, mask, false), ""); > - > - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid, > - LLVMConstInt(ctx->i32, idx, false), ""); > + for (unsigned i = 0; i < 4; ++i) { > + tl_ctrl |= (i & mask) << (2 * i); > + trbl_ctrl |= ((i & mask) + idx) << (2 * i); > + } > > - if (has_ds_bpermute) { > - args[0] = LLVMBuildMul(ctx->builder, tl_tid, > - LLVMConstInt(ctx->i32, 4, false), ""); > - args[1] = val; > + args[0] = val; > + args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false); > + args[2] = LLVMConstInt(ctx->i32, 0xf, false); > + args[3] = LLVMConstInt(ctx->i32, 0xf, false); > + 
args[4] = LLVMConstInt(ctx->i1, 1, false); > tl = ac_build_intrinsic(ctx, > - "llvm.amdgcn.ds.bpermute", ctx->i32, > - args, 2, > + "llvm.amdgcn.mov.dpp.i32", ctx->i32, > + args, 5, > AC_FUNC_ATTR_READNONE | > AC_FUNC_ATTR_CONVERGENT); > > - args[0] = LLVMBuildMul(ctx->builder, trbl_tid, > - LLVMConstInt(ctx->i32, 4, false), ""); > + args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false); > trbl = ac_build_intrinsic(ctx, > - "llvm.amdgcn.ds.bpermute", ctx->i32, > - args, 2, > + "llvm.amdgcn.mov.dpp.i32", ctx->i32, > + args, 5, > AC_FUNC_ATTR_READNONE | > AC_FUNC_ATTR_CONVERGENT); > } else { > + LLVMValueRef tl_tid, trbl_tid; > + > + thread_id = ac_get_thread_id(ctx); > + > + tl_tid = LLVMBuildAnd(ctx->builder, thread_id, > + LLVMConstInt(ctx->i32, mask, false), ""); > + > + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid, > + LLVMConstInt(ctx->i32, idx, false), > ""); > + > + > LLVMValueRef store_ptr, load_ptr0, load_ptr1; > > store_ptr = ac_build_gep0(ctx, lds, thread_id); > diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h > index ebb78fbd79b..14260b05018 100644 > --- a/src/amd/common/ac_llvm_build.h > +++ b/src/amd/common/ac_llvm_build.h > @@ -161,7 +161,7 @@ ac_get_thread_id(struct ac_llvm_context *ctx); > > LLVMValueRef > ac_build_ddxy(struct ac_llvm_context *ctx, > - bool has_ds_bpermute, > + bool has_mov_dpp, > uint32_t mask, > int idx, > LLVMValueRef lds, > diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c > index 49117d21bd2..2385c60d316 100644 > --- a/src/amd/common/ac_nir_to_llvm.c > +++ b/src/amd/common/ac_nir_to_llvm.c > @@ -164,7 +164,7 @@ struct nir_to_llvm_context { > uint8_t num_output_clips; > uint8_t num_output_culls; > > - bool has_ds_bpermute; > + bool has_mov_dpp; > > bool is_gs_copy_shader; > LLVMValueRef gs_next_vertex; > @@ -1434,7 +1434,7 @@ static LLVMValueRef emit_ddxy(struct > nir_to_llvm_context *ctx, > LLVMValueRef result; > ctx->has_ddxy = true; > > - if (!ctx->lds && 
!ctx->has_ds_bpermute) > + if (!ctx->lds && !ctx->has_mov_dpp) > ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module, > > LLVMArrayType(ctx->i32, 64), > "ddxy_lds", > LOCAL_ADDR_SPACE); > @@ -1454,7 +1454,7 @@ static LLVMValueRef emit_ddxy(struct > nir_to_llvm_context *ctx, > else > idx = 2; > > - result = ac_build_ddxy(&ctx->ac, ctx->has_ds_bpermute, > + result = ac_build_ddxy(&ctx->ac, ctx->has_mov_dpp, > mask, idx, ctx->lds, > src0); > return result; > @@ -5858,7 +5858,7 @@ LLVMModuleRef > ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, > ac_llvm_context_init(&ctx.ac, ctx.context); > ctx.ac.module = ctx.module; > > - ctx.has_ds_bpermute = ctx.options->chip_class >= VI; > + ctx.has_mov_dpp = ctx.options->chip_class >= VI; > > memset(shader_info, 0, sizeof(*shader_info)); > > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c > b/src/gallium/drivers/radeonsi/si_pipe.c > index cb372267cde..7e83d5e5ac4 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.c > +++ b/src/gallium/drivers/radeonsi/si_pipe.c > @@ -944,7 +944,7 @@ struct pipe_screen *radeonsi_screen_create(struct > radeon_winsys *ws) > sscreen->b.info.pfp_fw_version >= 121 && > sscreen->b.info.me_fw_version >= 87); > > - sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI; > + sscreen->has_mov_dpp = sscreen->b.chip_class >= VI; > sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= > CHIP_POLARIS10 && > sscreen->b.family <= > CHIP_POLARIS12) || > sscreen->b.family == CHIP_VEGA10 || > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h > b/src/gallium/drivers/radeonsi/si_pipe.h > index 108929c10c6..ef139fb0cd3 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.h > +++ b/src/gallium/drivers/radeonsi/si_pipe.h > @@ -79,7 +79,7 @@ struct si_screen { > unsigned tess_offchip_block_dw_size; > bool has_distributed_tess; > bool has_draw_indirect_multi; > - bool has_ds_bpermute; > + bool has_mov_dpp; > bool has_msaa_sample_loc_bug; > > /* Whether shaders are monolithic (1-part) or separate (3-part). 
*/ > diff --git a/src/gallium/drivers/radeonsi/si_shader.c > b/src/gallium/drivers/radeonsi/si_shader.c > index 2c92269a575..2eed45d79a5 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.c > +++ b/src/gallium/drivers/radeonsi/si_shader.c > @@ -3442,7 +3442,7 @@ static void si_llvm_emit_ddxy( > idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? > 1 : 2; > > val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], > ctx->i32, ""); > - val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute, > + val = ac_build_ddxy(&ctx->ac, ctx->screen->has_mov_dpp, > mask, idx, ctx->lds, val); > emit_data->output[emit_data->chan] = val; > } > @@ -4454,7 +4454,7 @@ static void create_function(struct si_shader_context > *ctx) > assert(shader->info.num_input_vgprs >= num_prolog_vgprs); > shader->info.num_input_vgprs -= num_prolog_vgprs; > > - if (!ctx->screen->has_ds_bpermute && > + if (!ctx->screen->has_mov_dpp && > bld_base->info && > (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || > bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 || > -- > 2.13.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev