On Wed, Oct 2, 2013 at 2:14 AM, Chris Forbes <chr...@ijw.co.nz> wrote: > With those fixes: > > Reviewed-by: Chris Forbes <chr...@ijw.co.nz> Thanks, I will push it shortly.
With this change landed, the slowness of sample_d is no longer the bottleneck. Instead, the lack of native SIMD16 sample_d becomes the problem. I have posted my other series that emulates SIMD16 sample_d with dual SIMD8 sample_d for review. > > On Wed, Oct 2, 2013 at 6:38 AM, Ian Romanick <i...@freedesktop.org> wrote: >> On 09/30/2013 10:54 PM, Chia-I Wu wrote: >>> From: Chia-I Wu <o...@lunarg.com> >> >> I agree with both of Ken's comments. With those fixed, this patch is >> >> Reviewed-by: Ian Romanick <ian.d.roman...@intel.com> >> >>> Consider only the top-left and top-right pixels to approximate DDX in a 2x2 >>> subspan, unless the application requests a more accurate approximation via >>> GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the >>> new driconf option disable_derivative_optimization. >>> >>> This results in a less accurate approximation. However, it improves the >>> performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at >>> 95.0% >>> confidence) on Haswell. No noticeable image quality difference observed. >>> >>> The improvement comes from faster sample_d. It seems, on Haswell, some >>> optimizations are introduced to allow faster sample_d when all pixels in a >>> subspan have the same derivative. I considered SAMPLE_STATE too, which >>> allows >>> one to control the quality of sample_d on Haswell. But it gave much worse >>> image quality without giving better performance comparing to this change. >>> >>> No piglit quick.tests regression on Haswell, except with in-parameter-struct >>> and normal-parameter-struct tests which appear to be noises. >>> >>> Signed-off-by: Chia-I Wu <o...@lunarg.com> >>> --- >>> src/mesa/drivers/dri/i965/brw_context.c | 2 ++ >>> src/mesa/drivers/dri/i965/brw_context.h | 1 + >>> src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 >>> +++++++++++++++++++------- >>> src/mesa/drivers/dri/i965/brw_wm.c | 11 +++++++++ >>> src/mesa/drivers/dri/i965/brw_wm.h | 1 + >>> src/mesa/drivers/dri/i965/intel_screen.c | 4 ++++ >>> 6 files changed, 44 insertions(+), 8 deletions(-) >>> >>> diff --git a/src/mesa/drivers/dri/i965/brw_context.c >>> b/src/mesa/drivers/dri/i965/brw_context.c >>> index 5f58a29..18b8e57 100644 >>> --- a/src/mesa/drivers/dri/i965/brw_context.c >>> +++ b/src/mesa/drivers/dri/i965/brw_context.c >>> @@ -478,6 +478,8 @@ brwCreateContext(int api, >>> brw_draw_init( brw ); >>> >>> brw->precompile = driQueryOptionb(&brw->optionCache, >>> "shader_precompile"); >>> + brw->disable_derivative_optimization = >>> + driQueryOptionb(&brw->optionCache, >>> "disable_derivative_optimization"); >>> >>> ctx->Const.ContextFlags = 0; >>> if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0) >>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h >>> b/src/mesa/drivers/dri/i965/brw_context.h >>> index 0f88bad..0ec1218 100644 >>> --- a/src/mesa/drivers/dri/i965/brw_context.h >>> +++ b/src/mesa/drivers/dri/i965/brw_context.h >>> @@ -1005,6 +1005,7 @@ struct brw_context >>> bool always_flush_cache; >>> bool disable_throttling; >>> bool precompile; >>> + bool disable_derivative_optimization; >>> >>> driOptionCache optionCache; >>> /** @} */ >>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp >>> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp >>> index 7ce42c4..9eb5e17 100644 >>> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp >>> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp >>> @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct >>> brw_reg dst, struct brw_reg src >>> * >>> * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br >>> * >>> - * and we're trying to produce: >>> + * Ideally, we want to produce: >>> * >>> * DDX DDY >>> * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) >>> @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct >>> brw_reg dst, struct brw_reg src >>> * >>> * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same >>> result >>> * for each pair, and vertstride = 2 jumps us 2 elements after processing a >>> - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled >>> - * between each other. We could probably do it like ddx and swizzle the >>> right >>> - * order later, but bail for now and just produce >>> + * pair. But the ideal approximation may impose a huge performance cost on >>> + * sample_d. On at least Haswell, sample_d instruction does some >>> + * optimizations if the same LOD is used for all pixels in the subspan. >>> + * >>> + * For DDY, it's harder, as we want to produce the pairs swizzled between >>> each >>> + * other. We could probably do it like ddx and swizzle the right order >>> later, >>> + * but bail for now and just produce >>> * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) >>> */ >>> void >>> fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct >>> brw_reg src) >>> { >>> + unsigned vstride, width; >>> + >>> + if (c->key.high_quality_derivatives) { >>> + /* produce accurate derivatives */ >>> + vstride = BRW_VERTICAL_STRIDE_2; >>> + width = BRW_WIDTH_2; >>> + } >>> + else { >>> + /* replicate the derivative at the top-left pixel to other pixels */ >>> + vstride = BRW_VERTICAL_STRIDE_4; >>> + width = BRW_WIDTH_4; >>> + } >>> + >>> struct brw_reg src0 = brw_reg(src.file, src.nr, 1, >>> BRW_REGISTER_TYPE_F, >>> - BRW_VERTICAL_STRIDE_2, >>> - BRW_WIDTH_2, >>> + vstride, >>> + width, >>> BRW_HORIZONTAL_STRIDE_0, >>> BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); >>> struct brw_reg src1 = brw_reg(src.file, src.nr, 0, >>> BRW_REGISTER_TYPE_F, >>> - BRW_VERTICAL_STRIDE_2, >>> - BRW_WIDTH_2, >>> + vstride, >>> + width, >>> BRW_HORIZONTAL_STRIDE_0, >>> BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); >>> brw_ADD(p, dst, src0, negate(src1)); >>> diff --git a/src/mesa/drivers/dri/i965/brw_wm.c >>> b/src/mesa/drivers/dri/i965/brw_wm.c >>> index 3d7ca2a..5158bcc 100644 >>> --- a/src/mesa/drivers/dri/i965/brw_wm.c >>> +++ b/src/mesa/drivers/dri/i965/brw_wm.c >>> @@ -416,6 +416,16 @@ static void brw_wm_populate_key( struct brw_context >>> *brw, >>> >>> key->line_aa = line_aa; >>> >>> + /* _NEW_HINT */ >>> + if (brw->disable_derivative_optimization) { >>> + key->high_quality_derivatives = >>> + ctx->Hint.FragmentShaderDerivative != GL_FASTEST; >>> + } >>> + else { >>> + key->high_quality_derivatives = >>> + ctx->Hint.FragmentShaderDerivative == GL_NICEST; >>> + } >>> + >>> if (brw->gen < 6) >>> key->stats_wm = brw->stats_wm; >>> >>> @@ -503,6 +513,7 @@ const struct brw_tracked_state brw_wm_prog = { >>> _NEW_STENCIL | >>> _NEW_POLYGON | >>> _NEW_LINE | >>> + _NEW_HINT | >>> _NEW_LIGHT | >>> _NEW_FRAG_CLAMP | >>> _NEW_BUFFERS | >>> diff --git a/src/mesa/drivers/dri/i965/brw_wm.h >>> b/src/mesa/drivers/dri/i965/brw_wm.h >>> index f7a2c5f..aa786de 100644 >>> --- a/src/mesa/drivers/dri/i965/brw_wm.h >>> +++ b/src/mesa/drivers/dri/i965/brw_wm.h >>> @@ -66,6 +66,7 @@ struct brw_wm_prog_key { >>> GLuint render_to_fbo:1; >>> GLuint clamp_fragment_color:1; >>> GLuint line_aa:2; >>> + GLuint high_quality_derivatives:1; >>> >>> GLushort drawable_height; >>> GLbitfield64 input_slots_valid; >>> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c >>> b/src/mesa/drivers/dri/i965/intel_screen.c >>> index de80a00..cddc8e8 100644 >>> --- a/src/mesa/drivers/dri/i965/intel_screen.c >>> +++ b/src/mesa/drivers/dri/i965/intel_screen.c >>> @@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] = >>> DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+") >>> DRI_CONF_OPT_END >>> >>> + DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false") >>> + DRI_CONF_DESC(en, "Derivatives with finer granularity by default") >>> + DRI_CONF_OPT_END >>> + >>> DRI_CONF_SECTION_END >>> DRI_CONF_SECTION_QUALITY >>> DRI_CONF_FORCE_S3TC_ENABLE("false") >>> >> -- o...@lunarg.com _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev