Adds suppport for ARB_fragment_shader_interlock. We achieve the interlock and fragment ordering by issuing a memory fence via sendc.
Signed-off-by: Plamena Manolova <plamena.manol...@intel.com> --- docs/features.txt | 2 +- docs/relnotes/17.1.0.html | 1 + src/intel/compiler/brw_eu.h | 4 +++ src/intel/compiler/brw_eu_defines.h | 2 ++ src/intel/compiler/brw_eu_emit.c | 47 ++++++++++++++++++++++++++++ src/intel/compiler/brw_fs_generator.cpp | 4 +++ src/intel/compiler/brw_fs_nir.cpp | 15 +++++++++ src/intel/compiler/brw_shader.cpp | 4 +++ src/mesa/drivers/dri/i965/intel_extensions.c | 5 +++ 9 files changed, 83 insertions(+), 1 deletion(-) diff --git a/docs/features.txt b/docs/features.txt index 5f63632..a6237c0 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -281,7 +281,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve GL_ARB_cl_event not started GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi) GL_ARB_ES3_2_compatibility DONE (i965/gen8+) - GL_ARB_fragment_shader_interlock not started + GL_ARB_fragment_shader_interlock DONE (i965) GL_ARB_gl_spirv not started GL_ARB_gpu_shader_int64 DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe) GL_ARB_indirect_parameters DONE (nvc0, radeonsi) diff --git a/docs/relnotes/17.1.0.html b/docs/relnotes/17.1.0.html index e7cfe38..1b2393f 100644 --- a/docs/relnotes/17.1.0.html +++ b/docs/relnotes/17.1.0.html @@ -45,6 +45,7 @@ Note: some of the new features are only available with certain drivers. <ul> <li>OpenGL 4.2 on i965/ivb</li> +<li>GL_ARB_fragment_shader_interlock on i965</li> <li>GL_ARB_gpu_shader_fp64 on i965/ivybridge</li> <li>GL_ARB_gpu_shader_int64 on i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe</li> <li>GL_ARB_shader_ballot on nvc0, radeonsi</li> diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index f422595..117cfae 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -480,6 +480,10 @@ brw_memory_fence(struct brw_codegen *p, struct brw_reg dst); void +brw_interlock(struct brw_codegen *p, + struct brw_reg dst); + +void brw_pixel_interpolator_query(struct brw_codegen *p, struct brw_reg dest, struct brw_reg mrf, diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 13a70f6..9eb5210 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -444,6 +444,8 @@ enum opcode { */ SHADER_OPCODE_BROADCAST, + SHADER_OPCODE_INTERLOCK, + VEC4_OPCODE_MOV_BYTES, VEC4_OPCODE_PACK_BYTES, VEC4_OPCODE_UNPACK_UNIFORM, diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 231d6fd..52adf22 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -3403,6 +3403,53 @@ brw_memory_fence(struct brw_codegen *p, } void +brw_interlock(struct brw_codegen *p, + struct brw_reg dst) +{ + const struct gen_device_info *devinfo = p->devinfo; + const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell; + struct brw_inst *insn; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + dst = vec1(dst); + + /* Set dst as destination for dependency tracking, the MEMORY_FENCE + * message doesn't write anything back. + */ + /* BRW_OPCODE_SENDC is what the interlock actually depends on */ + insn = next_insn(p, BRW_OPCODE_SENDC); + dst = retype(dst, BRW_REGISTER_TYPE_UW); + brw_set_dest(p, insn, dst); + brw_set_src0(p, insn, dst); + /* Issuing a memory fence ensures the ordering of fragments */ + brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE, + commit_enable); + + if (devinfo->gen == 7 && !devinfo->is_haswell) { + /* IVB does typed surface access through the render cache, so we need to + * flush it too. Use a different register so both flushes can be + * pipelined by the hardware. + */ + insn = next_insn(p, BRW_OPCODE_SENDC); + brw_set_dest(p, insn, offset(dst, 1)); + brw_set_src0(p, insn, offset(dst, 1)); + brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE, + commit_enable); + + /* Now write the response of the second message into the response of the + * first to trigger a pipeline stall -- This way future render and data + * cache messages will be properly ordered with respect to past data and + * render cache messages. + */ + brw_MOV(p, dst, offset(dst, 1)); + } + + brw_pop_insn_state(p); +} + +void brw_pixel_interpolator_query(struct brw_codegen *p, struct brw_reg dest, struct brw_reg mrf, diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index a7f95cc..093c12d 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -2070,6 +2070,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_memory_fence(p, dst); break; + case SHADER_OPCODE_INTERLOCK: + brw_interlock(p, dst); + break; + case SHADER_OPCODE_FIND_LIVE_CHANNEL: { const struct brw_reg mask = brw_stage_has_packed_dispatch(devinfo, stage, diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 23cd4b7..9d6302c 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4138,6 +4138,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_begin_invocation_interlock_ARB: { + const fs_builder ubld = bld.group(8, 0); + const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); + + ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 * + REG_SIZE; + + break; + } + + case nir_intrinsic_end_invocation_interlock_ARB: { + /* We don't need to do anything here */ + break; + } + default: unreachable("unknown intrinsic"); } diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 304b4ec..8b9e42f 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -290,6 +290,9 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) return "typed_surface_write_logical"; case SHADER_OPCODE_MEMORY_FENCE: return "memory_fence"; + case SHADER_OPCODE_INTERLOCK: + /* For an interlock we actually issue a memory fence via sendc. */ + return "memory_fence"; case SHADER_OPCODE_LOAD_PAYLOAD: return "load_payload"; @@ -989,6 +992,7 @@ backend_instruction::has_side_effects() const case SHADER_OPCODE_TYPED_SURFACE_WRITE: case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_INTERLOCK: case SHADER_OPCODE_URB_WRITE_SIMD8: case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 0133fa1..830e530 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -297,4 +297,9 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.EXT_texture_compression_s3tc = true; ctx->Extensions.ANGLE_texture_compression_dxt = true; + + if ((ctx->Extensions.ARB_shader_image_load_store || ctx->Version >= 42) && + ctx->Const.GLSLVersion >= 420) { + ctx->Extensions.ARB_fragment_shader_interlock = true; + } } -- 2.9.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev