These intrinsics have the offset in dwords already computed in the last source, so the change here is basically using that instead of emitting the ir3_SHR to divide the byte-offset by 4.
The improvement in shader stats is dramatic, of up to ~15% in instruction count in some cases. Tested on a5xx. shader-db is unfortunately not very useful here because shaders that use SSBO require GLSL versions that are not supported by freedreno yet. For examples, most Khronos CTS tests under 'dEQP-GLES31.functional.ssbo.*' are helped. A random case: dEQP-GLES31.functional.ssbo.layout.2_level_array.packed.row_major_mat3x2 with current master: ; CL prog 14/1: 1252 instructions, 0 half, 48 full ; 8 const, 8 constlen ; 61 (ss), 43 (sy) with the SSBO dword-offset moved to NIR: ; CL prog 14/1: 1053 instructions, 0 half, 45 full ; 7 const, 7 constlen ; 34 (ss), 73 (sy) The SHR previously emitted for every single SSBO instruction disappears in most cases, and the dword-offset ends up embedded in the STGB instruction as immediate in many cases as well. No regressions observed with relevant CTS and piglit tests. --- src/freedreno/ir3/ir3_compiler_nir.c | 71 +++++++++++++++++++--------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 0e141f03181..c494913f254 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -760,7 +760,12 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, offset, create_immed(b, 0), }, 2); - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + + /* intrinsic->src[2] holds the dword-offset as placed by + * 'ir3_nir_lower_io_offsets' pass. + */ + assert(intr->intrinsic == nir_intrinsic_load_ssbo_ir3); + src1 = ir3_get_src(ctx, &intr->src[2])[0]; ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0, src0, 0, src1, 0); @@ -798,7 +803,13 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) * nir already *= 4: */ src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp); - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + + /* intrinsic->src[3] holds the dword-offset as placed by + * 'ir3_nir_lower_io_offsets' pass. + */ + assert(intr->intrinsic == nir_intrinsic_store_ssbo_ir3); + src1 = ir3_get_src(ctx, &intr->src[3])[0]; + src2 = ir3_create_collect(ctx, (struct ir3_instruction*[]){ offset, create_immed(b, 0), @@ -869,40 +880,50 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) * Note that nir already multiplies the offset by four */ src0 = ir3_get_src(ctx, &intr->src[2])[0]; - src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + + /* intrinsic->src[3] holds the dword-offset as placed by + * 'ir3_nir_lower_io_offsets' pass. It doesn't handle + * 'atomic_comp_swap', though, because that intrinsic already used + * all its 4 sources. + */ + if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + else + src1 = ir3_get_src(ctx, &intr->src[3])[0]; + src2 = ir3_create_collect(ctx, (struct ir3_instruction*[]){ offset, create_immed(b, 0), }, 2); switch (intr->intrinsic) { - case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_add_ir3: atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); break; - case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_imin_ir3: atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); type = TYPE_S32; break; - case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_umin_ir3: atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); break; - case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_imax_ir3: atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); type = TYPE_S32; break; - case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_umax_ir3: atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); break; - case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_and_ir3: atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); break; - case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_or_ir3: atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); break; - case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_xor_ir3: atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); break; - case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_exchange_ir3: atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); break; case nir_intrinsic_ssbo_atomic_comp_swap: @@ -1639,24 +1660,28 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) } } break; - case nir_intrinsic_load_ssbo: + /* All SSBO intrinsics (except atomic_comp_swap) should have been + * lowered by 'lower_io_offsets' pass and replaced by an ir3-specifc + * version that adds the dword-offset in the last component. + */ + case nir_intrinsic_load_ssbo_ir3: emit_intrinsic_load_ssbo(ctx, intr, dst); break; - case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_ssbo_ir3: emit_intrinsic_store_ssbo(ctx, intr); break; case nir_intrinsic_get_buffer_size: emit_intrinsic_ssbo_size(ctx, intr, dst); break; - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_add_ir3: + case nir_intrinsic_ssbo_atomic_imin_ir3: + case nir_intrinsic_ssbo_atomic_umin_ir3: + case nir_intrinsic_ssbo_atomic_imax_ir3: + case nir_intrinsic_ssbo_atomic_umax_ir3: + case nir_intrinsic_ssbo_atomic_and_ir3: + case nir_intrinsic_ssbo_atomic_or_ir3: + case nir_intrinsic_ssbo_atomic_xor_ir3: + case nir_intrinsic_ssbo_atomic_exchange_ir3: case nir_intrinsic_ssbo_atomic_comp_swap: dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr); break; -- 2.20.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev