On 2/13/19 10:29 PM, Eduardo Lima Mitev wrote: > These intrinsics have the offset in dwords already computed in the last > source, so the change here is basically using that instead of emitting > the ir3_SHR to divide the byte-offset by 4. > > The improvement in shader stats is dramatic, of up to ~15% in > instruction count in some cases. Tested on a5xx. > > shader-db is unfortunately not very useful here because shaders that use > SSBO require GLSL versions that are not supported by freedreno yet. > > For examples, most Khronos CTS tests under 'dEQP-GLES31.functional.ssbo.*' > are helped. > > A random case: > > dEQP-GLES31.functional.ssbo.layout.2_level_array.packed.row_major_mat3x2 > > with current master: > > ; CL prog 14/1: 1252 instructions, 0 half, 48 full > ; 8 const, 8 constlen > ; 61 (ss), 43 (sy) > > with the SSBO dword-offset moved to NIR: > > ; CL prog 14/1: 1053 instructions, 0 half, 45 full > ; 7 const, 7 constlen > ; 34 (ss), 73 (sy) > > The SHR previously emitted for every single SSBO instruction disappears > in most cases, and the dword-offset ends up embedded in the STGB > instruction as immediate in many cases as well. > > No regressions observed with relevant CTS and piglit tests. > --- > src/freedreno/ir3/ir3_compiler_nir.c | 71 +++++++++++++++++++--------- > 1 file changed, 48 insertions(+), 23 deletions(-) > > diff --git a/src/freedreno/ir3/ir3_compiler_nir.c > b/src/freedreno/ir3/ir3_compiler_nir.c > index 0e141f03181..c494913f254 100644 > --- a/src/freedreno/ir3/ir3_compiler_nir.c > +++ b/src/freedreno/ir3/ir3_compiler_nir.c > @@ -760,7 +760,12 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, > nir_intrinsic_instr *intr, > offset, > create_immed(b, 0), > }, 2); > - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); > + > + /* intrinsic->src[2] holds the dword-offset as placed by > + * 'ir3_nir_lower_io_offsets' pass. > + */ > + assert(intr->intrinsic == nir_intrinsic_load_ssbo_ir3);
This should be debug_assert(). Fixed locally. > + src1 = ir3_get_src(ctx, &intr->src[2])[0]; > > ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0, > src0, 0, src1, 0); > @@ -798,7 +803,13 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, > nir_intrinsic_instr *intr) > * nir already *= 4: > */ > src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp); > - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); > + > + /* intrinsic->src[3] holds the dword-offset as placed by > + * 'ir3_nir_lower_io_offsets' pass. > + */ > + assert(intr->intrinsic == nir_intrinsic_store_ssbo_ir3); Same here. > + src1 = ir3_get_src(ctx, &intr->src[3])[0]; > + > src2 = ir3_create_collect(ctx, (struct ir3_instruction*[]){ > offset, > create_immed(b, 0), > @@ -869,40 +880,50 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, > nir_intrinsic_instr *intr) > * Note that nir already multiplies the offset by four > */ > src0 = ir3_get_src(ctx, &intr->src[2])[0]; > - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); > + > + /* intrinsic->src[3] holds the dword-offset as placed by > + * 'ir3_nir_lower_io_offsets' pass. It doesn't handle > + * 'atomic_comp_swap', though, because that intrinsic already used > + * all its 4 sources. > + */ > + if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) > + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); > + else > + src1 = ir3_get_src(ctx, &intr->src[3])[0]; > + > src2 = ir3_create_collect(ctx, (struct ir3_instruction*[]){ > offset, > create_immed(b, 0), > }, 2); > > switch (intr->intrinsic) { > - case nir_intrinsic_ssbo_atomic_add: > + case nir_intrinsic_ssbo_atomic_add_ir3: > atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, > 0); > break; > - case nir_intrinsic_ssbo_atomic_imin: > + case nir_intrinsic_ssbo_atomic_imin_ir3: > atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, > 0); > type = TYPE_S32; > break; > - case nir_intrinsic_ssbo_atomic_umin: > + case nir_intrinsic_ssbo_atomic_umin_ir3: > atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, > 0); > break; > - case nir_intrinsic_ssbo_atomic_imax: > + case nir_intrinsic_ssbo_atomic_imax_ir3: > atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, > 0); > type = TYPE_S32; > break; > - case nir_intrinsic_ssbo_atomic_umax: > + case nir_intrinsic_ssbo_atomic_umax_ir3: > atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, > 0); > break; > - case nir_intrinsic_ssbo_atomic_and: > + case nir_intrinsic_ssbo_atomic_and_ir3: > atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, > 0); > break; > - case nir_intrinsic_ssbo_atomic_or: > + case nir_intrinsic_ssbo_atomic_or_ir3: > atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); > break; > - case nir_intrinsic_ssbo_atomic_xor: > + case nir_intrinsic_ssbo_atomic_xor_ir3: > atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, > 0); > break; > - case nir_intrinsic_ssbo_atomic_exchange: > + case nir_intrinsic_ssbo_atomic_exchange_ir3: > atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, > 0); > break; > case nir_intrinsic_ssbo_atomic_comp_swap: > @@ -1639,24 +1660,28 @@ emit_intrinsic(struct ir3_context *ctx, > nir_intrinsic_instr *intr) > } > } > break; > - case nir_intrinsic_load_ssbo: > + /* All SSBO intrinsics (except atomic_comp_swap) should have > been > + * lowered by 'lower_io_offsets' pass and replaced by an > ir3-specifc > + * version that adds the dword-offset in the last component. > + */ > + case nir_intrinsic_load_ssbo_ir3: > emit_intrinsic_load_ssbo(ctx, intr, dst); > break; > - case nir_intrinsic_store_ssbo: > + case nir_intrinsic_store_ssbo_ir3: > emit_intrinsic_store_ssbo(ctx, intr); > break; > case nir_intrinsic_get_buffer_size: > emit_intrinsic_ssbo_size(ctx, intr, dst); > break; > - case nir_intrinsic_ssbo_atomic_add: > - case nir_intrinsic_ssbo_atomic_imin: > - case nir_intrinsic_ssbo_atomic_umin: > - case nir_intrinsic_ssbo_atomic_imax: > - case nir_intrinsic_ssbo_atomic_umax: > - case nir_intrinsic_ssbo_atomic_and: > - case nir_intrinsic_ssbo_atomic_or: > - case nir_intrinsic_ssbo_atomic_xor: > - case nir_intrinsic_ssbo_atomic_exchange: > + case nir_intrinsic_ssbo_atomic_add_ir3: > + case nir_intrinsic_ssbo_atomic_imin_ir3: > + case nir_intrinsic_ssbo_atomic_umin_ir3: > + case nir_intrinsic_ssbo_atomic_imax_ir3: > + case nir_intrinsic_ssbo_atomic_umax_ir3: > + case nir_intrinsic_ssbo_atomic_and_ir3: > + case nir_intrinsic_ssbo_atomic_or_ir3: > + case nir_intrinsic_ssbo_atomic_xor_ir3: > + case nir_intrinsic_ssbo_atomic_exchange_ir3: > case nir_intrinsic_ssbo_atomic_comp_swap: > dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr); > break; > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev