With those fixes: Reviewed-by: Chris Forbes <chr...@ijw.co.nz>
On Wed, Oct 2, 2013 at 6:38 AM, Ian Romanick <i...@freedesktop.org> wrote: > On 09/30/2013 10:54 PM, Chia-I Wu wrote: >> From: Chia-I Wu <o...@lunarg.com> > > I agree with both of Ken's comments. With those fixed, this patch is > > Reviewed-by: Ian Romanick <ian.d.roman...@intel.com> > >> Consider only the top-left and top-right pixels to approximate DDX in a 2x2 >> subspan, unless the application requests a more accurate approximation via >> GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the >> new driconf option disable_derivative_optimization. >> >> This results in a less accurate approximation. However, it improves the >> performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at >> 95.0% >> confidence) on Haswell. No noticeable image quality difference observed. >> >> The improvement comes from faster sample_d. It seems, on Haswell, some >> optimizations are introduced to allow faster sample_d when all pixels in a >> subspan have the same derivative. I considered SAMPLE_STATE too, which >> allows >> one to control the quality of sample_d on Haswell. But it gave much worse >> image quality without giving better performance comparing to this change. >> >> No piglit quick.tests regression on Haswell, except with in-parameter-struct >> and normal-parameter-struct tests which appear to be noises. >> >> Signed-off-by: Chia-I Wu <o...@lunarg.com> >> --- >> src/mesa/drivers/dri/i965/brw_context.c | 2 ++ >> src/mesa/drivers/dri/i965/brw_context.h | 1 + >> src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 >> +++++++++++++++++++------- >> src/mesa/drivers/dri/i965/brw_wm.c | 11 +++++++++ >> src/mesa/drivers/dri/i965/brw_wm.h | 1 + >> src/mesa/drivers/dri/i965/intel_screen.c | 4 ++++ >> 6 files changed, 44 insertions(+), 8 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_context.c >> b/src/mesa/drivers/dri/i965/brw_context.c >> index 5f58a29..18b8e57 100644 >> --- a/src/mesa/drivers/dri/i965/brw_context.c >> +++ b/src/mesa/drivers/dri/i965/brw_context.c >> @@ -478,6 +478,8 @@ brwCreateContext(int api, >> brw_draw_init( brw ); >> >> brw->precompile = driQueryOptionb(&brw->optionCache, >> "shader_precompile"); >> + brw->disable_derivative_optimization = >> + driQueryOptionb(&brw->optionCache, "disable_derivative_optimization"); >> >> ctx->Const.ContextFlags = 0; >> if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0) >> diff --git a/src/mesa/drivers/dri/i965/brw_context.h >> b/src/mesa/drivers/dri/i965/brw_context.h >> index 0f88bad..0ec1218 100644 >> --- a/src/mesa/drivers/dri/i965/brw_context.h >> +++ b/src/mesa/drivers/dri/i965/brw_context.h >> @@ -1005,6 +1005,7 @@ struct brw_context >> bool always_flush_cache; >> bool disable_throttling; >> bool precompile; >> + bool disable_derivative_optimization; >> >> driOptionCache optionCache; >> /** @} */ >> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp >> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp >> index 7ce42c4..9eb5e17 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp >> @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg >> dst, struct brw_reg src >> * >> * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br >> * >> - * and we're trying to produce: >> + * Ideally, we want to produce: >> * >> * DDX DDY >> * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) >> @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct >> brw_reg dst, struct brw_reg src >> * >> * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same >> result >> * for each pair, and vertstride = 2 jumps us 2 elements after processing a >> - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled >> - * between each other. We could probably do it like ddx and swizzle the >> right >> - * order later, but bail for now and just produce >> + * pair. But the ideal approximation may impose a huge performance cost on >> + * sample_d. On at least Haswell, sample_d instruction does some >> + * optimizations if the same LOD is used for all pixels in the subspan. >> + * >> + * For DDY, it's harder, as we want to produce the pairs swizzled between >> each >> + * other. We could probably do it like ddx and swizzle the right order >> later, >> + * but bail for now and just produce >> * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) >> */ >> void >> fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct >> brw_reg src) >> { >> + unsigned vstride, width; >> + >> + if (c->key.high_quality_derivatives) { >> + /* produce accurate derivatives */ >> + vstride = BRW_VERTICAL_STRIDE_2; >> + width = BRW_WIDTH_2; >> + } >> + else { >> + /* replicate the derivative at the top-left pixel to other pixels */ >> + vstride = BRW_VERTICAL_STRIDE_4; >> + width = BRW_WIDTH_4; >> + } >> + >> struct brw_reg src0 = brw_reg(src.file, src.nr, 1, >> BRW_REGISTER_TYPE_F, >> - BRW_VERTICAL_STRIDE_2, >> - BRW_WIDTH_2, >> + vstride, >> + width, >> BRW_HORIZONTAL_STRIDE_0, >> BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); >> struct brw_reg src1 = brw_reg(src.file, src.nr, 0, >> BRW_REGISTER_TYPE_F, >> - BRW_VERTICAL_STRIDE_2, >> - BRW_WIDTH_2, >> + vstride, >> + width, >> BRW_HORIZONTAL_STRIDE_0, >> BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); >> brw_ADD(p, dst, src0, negate(src1)); >> diff --git a/src/mesa/drivers/dri/i965/brw_wm.c >> b/src/mesa/drivers/dri/i965/brw_wm.c >> index 3d7ca2a..5158bcc 100644 >> --- a/src/mesa/drivers/dri/i965/brw_wm.c >> +++ b/src/mesa/drivers/dri/i965/brw_wm.c >> @@ -416,6 +416,16 @@ static void brw_wm_populate_key( struct brw_context >> *brw, >> >> key->line_aa = line_aa; >> >> + /* _NEW_HINT */ >> + if (brw->disable_derivative_optimization) { >> + key->high_quality_derivatives = >> + ctx->Hint.FragmentShaderDerivative != GL_FASTEST; >> + } >> + else { >> + key->high_quality_derivatives = >> + ctx->Hint.FragmentShaderDerivative == GL_NICEST; >> + } >> + >> if (brw->gen < 6) >> key->stats_wm = brw->stats_wm; >> >> @@ -503,6 +513,7 @@ const struct brw_tracked_state brw_wm_prog = { >> _NEW_STENCIL | >> _NEW_POLYGON | >> _NEW_LINE | >> + _NEW_HINT | >> _NEW_LIGHT | >> _NEW_FRAG_CLAMP | >> _NEW_BUFFERS | >> diff --git a/src/mesa/drivers/dri/i965/brw_wm.h >> b/src/mesa/drivers/dri/i965/brw_wm.h >> index f7a2c5f..aa786de 100644 >> --- a/src/mesa/drivers/dri/i965/brw_wm.h >> +++ b/src/mesa/drivers/dri/i965/brw_wm.h >> @@ -66,6 +66,7 @@ struct brw_wm_prog_key { >> GLuint render_to_fbo:1; >> GLuint clamp_fragment_color:1; >> GLuint line_aa:2; >> + GLuint high_quality_derivatives:1; >> >> GLushort drawable_height; >> GLbitfield64 input_slots_valid; >> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c >> b/src/mesa/drivers/dri/i965/intel_screen.c >> index de80a00..cddc8e8 100644 >> --- a/src/mesa/drivers/dri/i965/intel_screen.c >> +++ b/src/mesa/drivers/dri/i965/intel_screen.c >> @@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] = >> DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+") >> DRI_CONF_OPT_END >> >> + DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false") >> + DRI_CONF_DESC(en, "Derivatives with finer granularity by default") >> + DRI_CONF_OPT_END >> + >> DRI_CONF_SECTION_END >> DRI_CONF_SECTION_QUALITY >> DRI_CONF_FORCE_S3TC_ENABLE("false") >> > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev