The ir3-specific SSBO intrinsics already carry the offset in dwords as
their last source, so this change simply uses that source instead of
emitting an ir3_SHR_B to divide the byte-offset by 4.

The improvement in shader stats is dramatic: up to ~15% fewer
instructions in some cases. Tested on a5xx.

shader-db is unfortunately not very useful here because shaders that use
SSBOs require GLSL versions that are not supported by freedreno yet.

For example, most Khronos CTS tests under 'dEQP-GLES31.functional.ssbo.*'
benefit from this change.

A random case:

dEQP-GLES31.functional.ssbo.layout.2_level_array.packed.row_major_mat3x2

with current master:

; CL prog 14/1: 1252 instructions, 0 half, 48 full
; 8 const, 8 constlen
; 61 (ss), 43 (sy)

with the SSBO dword-offset moved to NIR:

; CL prog 14/1: 1053 instructions, 0 half, 45 full
; 7 const, 7 constlen
; 34 (ss), 73 (sy)

The SHR previously emitted for every single SSBO instruction disappears
in most cases, and in many cases the dword-offset also ends up embedded
in the STGB instruction as an immediate.

No regressions observed with relevant CTS and piglit tests.
---
 src/freedreno/ir3/ir3_compiler_nir.c | 71 +++++++++++++++++++---------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/src/freedreno/ir3/ir3_compiler_nir.c 
b/src/freedreno/ir3/ir3_compiler_nir.c
index 0e141f03181..c494913f254 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -760,7 +760,12 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, 
nir_intrinsic_instr *intr,
                offset,
                create_immed(b, 0),
        }, 2);
-       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+       /* intrinsic->src[2] holds the dword-offset as placed by
+        * 'ir3_nir_lower_io_offsets' pass.
+        */
+       assert(intr->intrinsic == nir_intrinsic_load_ssbo_ir3);
+       src1 = ir3_get_src(ctx, &intr->src[2])[0];
 
        ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0,
                        src0, 0, src1, 0);
@@ -798,7 +803,13 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, 
nir_intrinsic_instr *intr)
         * nir already *= 4:
         */
        src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
-       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+       /* intrinsic->src[3] holds the dword-offset as placed by
+        * 'ir3_nir_lower_io_offsets' pass.
+        */
+       assert(intr->intrinsic == nir_intrinsic_store_ssbo_ir3);
+       src1 = ir3_get_src(ctx, &intr->src[3])[0];
+
        src2 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
                offset,
                create_immed(b, 0),
@@ -869,40 +880,50 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, 
nir_intrinsic_instr *intr)
         * Note that nir already multiplies the offset by four
         */
        src0 = ir3_get_src(ctx, &intr->src[2])[0];
-       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+       /* intrinsic->src[3] holds the dword-offset as placed by
+        * 'ir3_nir_lower_io_offsets' pass. It doesn't handle
+        * 'atomic_comp_swap', though, because that intrinsic already used
+        * all its 4 sources.
+        */
+       if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
+               src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+       else
+               src1 = ir3_get_src(ctx, &intr->src[3])[0];
+
        src2 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
                offset,
                create_immed(b, 0),
        }, 2);
 
        switch (intr->intrinsic) {
-       case nir_intrinsic_ssbo_atomic_add:
+       case nir_intrinsic_ssbo_atomic_add_ir3:
                atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 
0);
                break;
-       case nir_intrinsic_ssbo_atomic_imin:
+       case nir_intrinsic_ssbo_atomic_imin_ir3:
                atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 
0);
                type = TYPE_S32;
                break;
-       case nir_intrinsic_ssbo_atomic_umin:
+       case nir_intrinsic_ssbo_atomic_umin_ir3:
                atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 
0);
                break;
-       case nir_intrinsic_ssbo_atomic_imax:
+       case nir_intrinsic_ssbo_atomic_imax_ir3:
                atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 
0);
                type = TYPE_S32;
                break;
-       case nir_intrinsic_ssbo_atomic_umax:
+       case nir_intrinsic_ssbo_atomic_umax_ir3:
                atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 
0);
                break;
-       case nir_intrinsic_ssbo_atomic_and:
+       case nir_intrinsic_ssbo_atomic_and_ir3:
                atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 
0);
                break;
-       case nir_intrinsic_ssbo_atomic_or:
+       case nir_intrinsic_ssbo_atomic_or_ir3:
                atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
                break;
-       case nir_intrinsic_ssbo_atomic_xor:
+       case nir_intrinsic_ssbo_atomic_xor_ir3:
                atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 
0);
                break;
-       case nir_intrinsic_ssbo_atomic_exchange:
+       case nir_intrinsic_ssbo_atomic_exchange_ir3:
                atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 
0);
                break;
        case nir_intrinsic_ssbo_atomic_comp_swap:
@@ -1639,24 +1660,28 @@ emit_intrinsic(struct ir3_context *ctx, 
nir_intrinsic_instr *intr)
                        }
                }
                break;
-       case nir_intrinsic_load_ssbo:
+               /* All SSBO intrinsics (except atomic_comp_swap) should have 
been
+                * lowered by 'lower_io_offsets' pass and replaced by an 
ir3-specific
+                * version that adds the dword-offset as the last source.
+                */
+       case nir_intrinsic_load_ssbo_ir3:
                emit_intrinsic_load_ssbo(ctx, intr, dst);
                break;
-       case nir_intrinsic_store_ssbo:
+       case nir_intrinsic_store_ssbo_ir3:
                emit_intrinsic_store_ssbo(ctx, intr);
                break;
        case nir_intrinsic_get_buffer_size:
                emit_intrinsic_ssbo_size(ctx, intr, dst);
                break;
-       case nir_intrinsic_ssbo_atomic_add:
-       case nir_intrinsic_ssbo_atomic_imin:
-       case nir_intrinsic_ssbo_atomic_umin:
-       case nir_intrinsic_ssbo_atomic_imax:
-       case nir_intrinsic_ssbo_atomic_umax:
-       case nir_intrinsic_ssbo_atomic_and:
-       case nir_intrinsic_ssbo_atomic_or:
-       case nir_intrinsic_ssbo_atomic_xor:
-       case nir_intrinsic_ssbo_atomic_exchange:
+       case nir_intrinsic_ssbo_atomic_add_ir3:
+       case nir_intrinsic_ssbo_atomic_imin_ir3:
+       case nir_intrinsic_ssbo_atomic_umin_ir3:
+       case nir_intrinsic_ssbo_atomic_imax_ir3:
+       case nir_intrinsic_ssbo_atomic_umax_ir3:
+       case nir_intrinsic_ssbo_atomic_and_ir3:
+       case nir_intrinsic_ssbo_atomic_or_ir3:
+       case nir_intrinsic_ssbo_atomic_xor_ir3:
+       case nir_intrinsic_ssbo_atomic_exchange_ir3:
        case nir_intrinsic_ssbo_atomic_comp_swap:
                dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);
                break;
-- 
2.20.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to