On 09/30/2013 10:54 PM, Chia-I Wu wrote: > From: Chia-I Wu <o...@lunarg.com>
I agree with both of Ken's comments. With those fixed, this patch is Reviewed-by: Ian Romanick <ian.d.roman...@intel.com> > Consider only the top-left and top-right pixels to approximate DDX in a 2x2 > subspan, unless the application requests a more accurate approximation via > GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the > new driconf option disable_derivative_optimization. > > This results in a less accurate approximation. However, it improves the > performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 95.0% > confidence) on Haswell. No noticeable image quality difference observed. > > The improvement comes from faster sample_d. It seems, on Haswell, some > optimizations are introduced to allow faster sample_d when all pixels in a > subspan have the same derivative. I considered SAMPLE_STATE too, which allows > one to control the quality of sample_d on Haswell. But it gave much worse > image quality without giving better performance comparing to this change. > > No piglit quick.tests regression on Haswell, except with in-parameter-struct > and normal-parameter-struct tests which appear to be noises. > > Signed-off-by: Chia-I Wu <o...@lunarg.com> > --- > src/mesa/drivers/dri/i965/brw_context.c | 2 ++ > src/mesa/drivers/dri/i965/brw_context.h | 1 + > src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 > +++++++++++++++++++------- > src/mesa/drivers/dri/i965/brw_wm.c | 11 +++++++++ > src/mesa/drivers/dri/i965/brw_wm.h | 1 + > src/mesa/drivers/dri/i965/intel_screen.c | 4 ++++ > 6 files changed, 44 insertions(+), 8 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_context.c > b/src/mesa/drivers/dri/i965/brw_context.c > index 5f58a29..18b8e57 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.c > +++ b/src/mesa/drivers/dri/i965/brw_context.c > @@ -478,6 +478,8 @@ brwCreateContext(int api, > brw_draw_init( brw ); > > brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile"); > + brw->disable_derivative_optimization = > + driQueryOptionb(&brw->optionCache, "disable_derivative_optimization"); > > ctx->Const.ContextFlags = 0; > if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0) > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 0f88bad..0ec1218 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -1005,6 +1005,7 @@ struct brw_context > bool always_flush_cache; > bool disable_throttling; > bool precompile; > + bool disable_derivative_optimization; > > driOptionCache optionCache; > /** @} */ > diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > index 7ce42c4..9eb5e17 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg > dst, struct brw_reg src > * > * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br > * > - * and we're trying to produce: > + * Ideally, we want to produce: > * > * DDX DDY > * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) > @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct > brw_reg dst, struct brw_reg src > * > * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result > * for each pair, and vertstride = 2 jumps us 2 elements after processing a > - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled > - * between each other. We could probably do it like ddx and swizzle the > right > - * order later, but bail for now and just produce > + * pair. But the ideal approximation may impose a huge performance cost on > + * sample_d. On at least Haswell, sample_d instruction does some > + * optimizations if the same LOD is used for all pixels in the subspan. > + * > + * For DDY, it's harder, as we want to produce the pairs swizzled between > each > + * other. We could probably do it like ddx and swizzle the right order > later, > + * but bail for now and just produce > * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) > */ > void > fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg > src) > { > + unsigned vstride, width; > + > + if (c->key.high_quality_derivatives) { > + /* produce accurate derivatives */ > + vstride = BRW_VERTICAL_STRIDE_2; > + width = BRW_WIDTH_2; > + } > + else { > + /* replicate the derivative at the top-left pixel to other pixels */ > + vstride = BRW_VERTICAL_STRIDE_4; > + width = BRW_WIDTH_4; > + } > + > struct brw_reg src0 = brw_reg(src.file, src.nr, 1, > BRW_REGISTER_TYPE_F, > - BRW_VERTICAL_STRIDE_2, > - BRW_WIDTH_2, > + vstride, > + width, > BRW_HORIZONTAL_STRIDE_0, > BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); > struct brw_reg src1 = brw_reg(src.file, src.nr, 0, > BRW_REGISTER_TYPE_F, > - BRW_VERTICAL_STRIDE_2, > - BRW_WIDTH_2, > + vstride, > + width, > BRW_HORIZONTAL_STRIDE_0, > BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); > brw_ADD(p, dst, src0, negate(src1)); > diff --git a/src/mesa/drivers/dri/i965/brw_wm.c > b/src/mesa/drivers/dri/i965/brw_wm.c > index 3d7ca2a..5158bcc 100644 > --- a/src/mesa/drivers/dri/i965/brw_wm.c > +++ b/src/mesa/drivers/dri/i965/brw_wm.c > @@ -416,6 +416,16 @@ static void brw_wm_populate_key( struct brw_context *brw, > > key->line_aa = line_aa; > > + /* _NEW_HINT */ > + if (brw->disable_derivative_optimization) { > + key->high_quality_derivatives = > + ctx->Hint.FragmentShaderDerivative != GL_FASTEST; > + } > + else { > + key->high_quality_derivatives = > + ctx->Hint.FragmentShaderDerivative == GL_NICEST; > + } > + > if (brw->gen < 6) > key->stats_wm = brw->stats_wm; > > @@ -503,6 +513,7 @@ const struct brw_tracked_state brw_wm_prog = { > _NEW_STENCIL | > _NEW_POLYGON | > _NEW_LINE | > + _NEW_HINT | > _NEW_LIGHT | > _NEW_FRAG_CLAMP | > _NEW_BUFFERS | > diff --git a/src/mesa/drivers/dri/i965/brw_wm.h > b/src/mesa/drivers/dri/i965/brw_wm.h > index f7a2c5f..aa786de 100644 > --- a/src/mesa/drivers/dri/i965/brw_wm.h > +++ b/src/mesa/drivers/dri/i965/brw_wm.h > @@ -66,6 +66,7 @@ struct brw_wm_prog_key { > GLuint render_to_fbo:1; > GLuint clamp_fragment_color:1; > GLuint line_aa:2; > + GLuint high_quality_derivatives:1; > > GLushort drawable_height; > GLbitfield64 input_slots_valid; > diff --git a/src/mesa/drivers/dri/i965/intel_screen.c > b/src/mesa/drivers/dri/i965/intel_screen.c > index de80a00..cddc8e8 100644 > --- a/src/mesa/drivers/dri/i965/intel_screen.c > +++ b/src/mesa/drivers/dri/i965/intel_screen.c > @@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] = > DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+") > DRI_CONF_OPT_END > > + DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false") > + DRI_CONF_DESC(en, "Derivatives with finer granularity by default") > + DRI_CONF_OPT_END > + > DRI_CONF_SECTION_END > DRI_CONF_SECTION_QUALITY > DRI_CONF_FORCE_S3TC_ENABLE("false") > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev