[Mesa-dev] [PATCH 5/5] radeonsi: allow out-of-order rasterization in commutative blending cases

Nicolai Hähnle Sat, 09 Sep 2017 03:45:00 -0700

From: Nicolai Hähnle <nicolai.haeh...@amd.com>

We do not enable this by default for additive blending, since it slightly
breaks OpenGL invariance guarantees due to non-determinism.


Still, there may be some applications that can benefit from white-listing
via the radeonsi_commutative_blend_add drirc setting without any real
visible artifacts.
---
 src/gallium/drivers/radeonsi/driinfo_radeonsi.h |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c          |  2 +
 src/gallium/drivers/radeonsi/si_pipe.h          |  1 +
 src/gallium/drivers/radeonsi/si_state.c         | 67 +++++++++++++++++++++++--
 src/gallium/drivers/radeonsi/si_state.h         |  1 +
 src/util/xmlpool/t_options.h                    |  5 ++
 6 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h 
b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
index 8be85289a0c..989e5175cc0 100644
--- a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
+++ b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
@@ -1,5 +1,6 @@
 // DriConf options specific to radeonsi
 DRI_CONF_SECTION_PERFORMANCE
     DRI_CONF_RADEONSI_ENABLE_SISCHED("false")
     DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
+    DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
 DRI_CONF_SECTION_END
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index b4972be739c..c44ea3be740 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1043,20 +1043,22 @@ struct pipe_screen *radeonsi_screen_create(struct 
radeon_winsys *ws,
                (sscreen->b.chip_class == SI &&
                 sscreen->b.info.pfp_fw_version >= 79 &&
                 sscreen->b.info.me_fw_version >= 142);
 
        sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
        sscreen->has_out_of_order_rast = sscreen->b.chip_class >= VI &&
                                         sscreen->b.info.max_se >= 2 &&
                                         !(sscreen->b.debug_flags & 
DBG_NO_OUT_OF_ORDER);
        sscreen->assume_no_z_fights =
                driQueryOptionb(config->options, "radeonsi_assume_no_z_fights");
+       sscreen->commutative_blend_add =
+               driQueryOptionb(config->options, 
"radeonsi_commutative_blend_add");
        sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 
&&
                                            sscreen->b.family <= 
CHIP_POLARIS12) ||
                                           sscreen->b.family == CHIP_VEGA10 ||
                                           sscreen->b.family == CHIP_RAVEN;
        sscreen->dpbb_allowed = sscreen->b.chip_class >= GFX9 &&
                                !(sscreen->b.debug_flags & DBG_NO_DPBB);
        sscreen->dfsm_allowed = sscreen->dpbb_allowed &&
                                !(sscreen->b.debug_flags & DBG_NO_DFSM);
 
        /* While it would be nice not to have this flag, we are constrained
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index d200c9f571f..27e2bc81172 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -90,20 +90,21 @@ struct u_suballocator;
 struct si_screen {
        struct r600_common_screen       b;
        unsigned                        gs_table_depth;
        unsigned                        tess_offchip_block_dw_size;
        bool                            has_clear_state;
        bool                            has_distributed_tess;
        bool                            has_draw_indirect_multi;
        bool                            has_ds_bpermute;
        bool                            has_out_of_order_rast;
        bool                            assume_no_z_fights;
+       bool                            commutative_blend_add;
        bool                            has_msaa_sample_loc_bug;
        bool                            dpbb_allowed;
        bool                            dfsm_allowed;
        bool                            llvm_has_working_vgpr_indexing;
 
        /* Whether shaders are monolithic (1-part) or separate (3-part). */
        bool                            use_monolithic_shaders;
        bool                            record_llvm_ir;
 
        mtx_t                   shader_parts_mutex;
diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index a8af5752771..6a063e7f7a6 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -370,20 +370,62 @@ static uint32_t si_translate_blend_opt_factor(int 
blend_fact, bool is_alpha)
        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
                return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
                return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
                                : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
        default:
                return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
        }
 }
 
+static void si_blend_check_commutativity(struct si_screen *sscreen,
+                                        struct si_state_blend *blend,
+                                        enum pipe_blend_func func,
+                                        enum pipe_blendfactor src,
+                                        enum pipe_blendfactor dst,
+                                        unsigned chanmask)
+{
+       /* Src factor is allowed when it does not depend on Dst */
+       static const uint32_t src_allowed =
+               (1u << PIPE_BLENDFACTOR_ONE) |
+               (1u << PIPE_BLENDFACTOR_SRC_COLOR) |
+               (1u << PIPE_BLENDFACTOR_SRC_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
+               (1u << PIPE_BLENDFACTOR_CONST_COLOR) |
+               (1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_SRC1_COLOR) |
+               (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_ZERO) |
+               (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
+               (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
+               (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
+               (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
+
+       if (dst == PIPE_BLENDFACTOR_ONE &&
+           (src_allowed & (1u << src))) {
+               /* Addition is commutative, but floating point addition isn't
+                * associative: subtle changes can be introduced via different
+                * rounding.
+                *
+                * Out-of-order is also non-deterministic, which means that
+                * this breaks OpenGL invariance requirements. So only enable
+                * out-of-order additive blending if explicitly allowed by a
+                * setting.
+                */
+               if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
+                   (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
+                       blend->commutative_4bit |= chanmask;
+       }
+}
+
 /**
  * Get rid of DST in the blend factors by commuting the operands:
  *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
  */
 static void si_blend_remove_dst(unsigned *func, unsigned *src_factor,
                                unsigned *dst_factor, unsigned expected_dst,
                                unsigned replacement_src)
 {
        if (*src_factor == expected_dst &&
            *dst_factor == PIPE_BLENDFACTOR_ZERO) {
@@ -486,20 +528,25 @@ static void *si_create_blend_state_mode(struct 
pipe_context *ctx,
                /* cb_render_state will disable unused ones */
                blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 
* i);
                if (state->rt[j].colormask)
                        blend->cb_target_enabled_4bit |= 0xf << (4 * i);
 
                if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
                        si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, 
blend_cntl);
                        continue;
                }
 
+               si_blend_check_commutativity(sctx->screen, blend,
+                                            eqRGB, srcRGB, dstRGB, 0x7 << (4 * 
i));
+               si_blend_check_commutativity(sctx->screen, blend,
+                                            eqA, srcA, dstA, 0x8 << (4 * i));
+
                /* Blending optimizations for RB+.
                 * These transformations don't change the behavior.
                 *
                 * First, get rid of DST in the blend factors:
                 *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
                 */
                si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB,
                                    PIPE_BLENDFACTOR_DST_COLOR,
                                    PIPE_BLENDFACTOR_SRC_COLOR);
                si_blend_remove_dst(&eqA, &srcA, &dstA,
@@ -629,20 +676,21 @@ static void si_bind_blend_state(struct pipe_context *ctx, 
void *state)
            (!old_blend ||
             old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
             old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
             old_blend->cb_target_enabled_4bit != 
blend->cb_target_enabled_4bit))
                si_mark_atom_dirty(sctx, &sctx->dpbb_state);
 
        if (sctx->screen->has_out_of_order_rast &&
            (!old_blend ||
             (old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
              old_blend->cb_target_enabled_4bit != 
blend->cb_target_enabled_4bit ||
+             old_blend->commutative_4bit != blend->commutative_4bit ||
              old_blend->logicop_enable != blend->logicop_enable)))
                si_mark_atom_dirty(sctx, &sctx->msaa_config);
 }
 
 static void si_delete_blend_state(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
 }
 
@@ -3193,26 +3241,37 @@ static bool si_out_of_order_rasterization(struct 
si_context *sctx)
                        return false;
 
                if (sctx->b.num_perfect_occlusion_queries != 0 &&
                    !dsa_order_invariant.pass_set)
                        return false;
        }
 
        if (!colormask)
                return true;
 
-       bool blend_enabled = (colormask & blend->blend_enable_4bit) != 0;
+       unsigned blendmask = colormask & blend->blend_enable_4bit;
 
-       if (blend_enabled)
-               return false; /* TODO */
+       if (blendmask) {
+               /* Only commutative blending. */
+               if (blendmask & ~blend->commutative_4bit)
+                       return false;
+
+               if (!dsa_order_invariant.pass_set)
+                       return false;
+       }
+
+       if (colormask & ~blendmask) {
+               if (!dsa_order_invariant.pass_last)
+                       return false;
+       }
 
-       return dsa_order_invariant.pass_last;
+       return true;
 }
 
 static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom 
*atom)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        unsigned num_tile_pipes = sctx->screen->b.info.num_tile_pipes;
        /* 33% faster rendering to linear color buffers */
        bool dst_is_linear = sctx->framebuffer.any_dst_linear;
        bool out_of_order_rast = si_out_of_order_rasterization(sctx);
        unsigned sc_mode_cntl_1 =
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index 56e597a5813..5aa50c58932 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -48,20 +48,21 @@ struct si_shader_selector;
 
 struct si_state_blend {
        struct si_pm4_state     pm4;
        uint32_t                cb_target_mask;
        /* Set 0xf or 0x0 (4 bits) per render target if the following is
         * true. ANDed with spi_shader_col_format.
         */
        unsigned                cb_target_enabled_4bit;
        unsigned                blend_enable_4bit;
        unsigned                need_src_alpha_4bit;
+       unsigned                commutative_4bit;
        bool                    alpha_to_coverage:1;
        bool                    alpha_to_one:1;
        bool                    dual_src_blend:1;
        bool                    logicop_enable:1;
 };
 
 struct si_state_rasterizer {
        struct si_pm4_state     pm4;
        /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
        struct si_pm4_state     *pm4_poly_offset;
diff --git a/src/util/xmlpool/t_options.h b/src/util/xmlpool/t_options.h
index c92215183a5..214c7c359ee 100644
--- a/src/util/xmlpool/t_options.h
+++ b/src/util/xmlpool/t_options.h
@@ -436,10 +436,15 @@ DRI_CONF_OPT_END
 
 #define DRI_CONF_RADEONSI_ENABLE_SISCHED(def) \
 DRI_CONF_OPT_BEGIN_B(radeonsi_enable_sisched, def) \
         DRI_CONF_DESC(en,gettext("Use the LLVM sisched option for shader 
compiles")) \
 DRI_CONF_OPT_END
 
 #define DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS(def) \
 DRI_CONF_OPT_BEGIN_B(radeonsi_assume_no_z_fights, def) \
         DRI_CONF_DESC(en,gettext("Assume no Z fights (enables aggressive 
out-of-order rasterization to improve performance; may cause rendering 
errors)")) \
 DRI_CONF_OPT_END
+
+#define DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD(def) \
+DRI_CONF_OPT_BEGIN_B(radeonsi_commutative_blend_add, def) \
+        DRI_CONF_DESC(en,gettext("Commutative additive blending optimizations 
(may cause rendering errors)")) \
+DRI_CONF_OPT_END
-- 
2.11.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 5/5] radeonsi: allow out-of-order rasterization in commutative blending cases

Reply via email to