Previously, we computed dFdy() using the following instruction: add(8) dst<1>F src<4,4,0)F -src.2<4,4,0>F { align1 1Q }
That had the disadvantage that it computed the same value for all 4 pixels of a 2x2 subspan, which meant that it was less accurate than dFdx(). This patch changes it to the following instruction: add(8) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1Q } Which has comparable accuracy to dFdx(). Unfortunately, for some reason the SIMD16 version of this instruction: add(16) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1H } Doesn't seem to work reliably (presumably the hardware designers never validated the combination of align16 mode with compressed instructions), so we unroll it to: add(8) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1Q } add(8) (dst+1)<1>F (src+1)<4,4,1>.xyxyF -(src+1)<4,4,1>.zwzwF { align16 2Q } --- I'm sending this patch out as an RFC because it increases instruction count on SIMD16 shaders. I believe it's worth it to get the increased accuracy. Also I believe the cost is minimal, since we're replacing a single add(16) with two add(8)'s, so the total number of instructions dispatched by the EU is the same. But I'd be interested in hearing others' opinions. Also, we shouldn't land this patch until we've come to a resolution on "i965/hsw: compute DDX in a subspan based only on top row". I'll be following up shortly with a piglit test that demonstrates the accuracy improvement. src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 34 +++++++++++++++++++------- src/mesa/drivers/dri/i965/brw_reg.h | 1 + 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 7ce42c4..7cb1f45 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -556,10 +556,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src * * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result * for each pair, and vertstride = 2 jumps us 2 elements after processing a - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled - * between each other. We could probably do it like ddx and swizzle the right - * order later, but bail for now and just produce - * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) + * pair. For DDY, we need to use ALIGN16 mode since it's capable of doing the + * appropriate swizzling. */ void fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) @@ -591,18 +589,36 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src BRW_REGISTER_TYPE_F, BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, - BRW_HORIZONTAL_STRIDE_0, - BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); - struct brw_reg src1 = brw_reg(src.file, src.nr, 2, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYXY, WRITEMASK_XYZW); + struct brw_reg src1 = brw_reg(src.file, src.nr, 0, BRW_REGISTER_TYPE_F, BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, - BRW_HORIZONTAL_STRIDE_0, - BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW); + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_16); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); if (negate_value) brw_ADD(p, dst, src1, negate(src0)); else brw_ADD(p, dst, src0, negate(src1)); + if (dispatch_width == 16) { + /* For some reason, instruction compression doesn't seem to work + * properly with ALIGN16 mode, so when doing a 16-wide dispatch, just + * manually unroll to two instructions. + */ + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + src0 = sechalf(src0); + src1 = sechalf(src1); + dst = sechalf(dst); + if (negate_value) + brw_ADD(p, dst, src1, negate(src0)); + else + brw_ADD(p, dst, src0, negate(src1)); + } + brw_pop_insn_state(p); } void diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index 6df3366..3ee3543 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -77,6 +77,7 @@ extern "C" { #define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2) #define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3) #define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) +#define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3) static inline bool brw_is_single_value_swizzle(int swiz) -- 1.8.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev