On Wed, May 29, 2013 at 10:06 AM, Anuj Phogat <anuj.pho...@gmail.com> wrote: > On Fri, May 24, 2013 at 12:01 PM, Paul Berry <stereotype...@gmail.com> wrote: >> On 16 May 2013 11:44, Anuj Phogat <anuj.pho...@gmail.com> wrote: >>> >>> In traditional multisampled framebuffer rendering, color samples must be >>> explicitly resolved via BlitFramebuffer before doing the scaled blitting >>> of the framebuffer. So, scaled blitting of a multisample framebuffer >>> takes two separate calls to BlitFramebuffer. >>> >>> This patch implements the functionality of doing multisampled scaled >>> resolve using just one BlitFramebuffer call. Important changes involved >>> in this patch are listed below: >>> - Use float registers to scale and offset texture coordinates. >>> - Change offset computation to consider float coordinates. >>> - Round the scaled coordinates down to nearest integer. >>> - Modify src texture coordinates clipping to account for scaling.. >>> - Linear filter is not yet implemented in blorp. So, don't use >>> blorp engine to do single sampled scaled blitting. >>> >>> Note: Observed no piglit regressions on sandybridge & ivybridge with >>> these changes. >>> >>> Signed-off-by: Anuj Phogat <anuj.pho...@gmail.com> >>> --- >>> src/mesa/drivers/dri/i965/brw_blorp.h | 23 ++-- >>> src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 143 >>> +++++++++++++++---------- >>> src/mesa/drivers/dri/i965/brw_reg.h | 7 -- >>> src/mesa/drivers/dri/intel/intel_mipmap_tree.c | 2 + >>> 4 files changed, 102 insertions(+), 73 deletions(-) >>> >>> diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h >>> b/src/mesa/drivers/dri/i965/brw_blorp.h >>> index 70e3933..a40324b 100644 >>> --- a/src/mesa/drivers/dri/i965/brw_blorp.h >>> +++ b/src/mesa/drivers/dri/i965/brw_blorp.h >>> @@ -40,9 +40,10 @@ brw_blorp_blit_miptrees(struct intel_context *intel, >>> unsigned src_level, unsigned src_layer, >>> struct intel_mipmap_tree *dst_mt, >>> unsigned dst_level, unsigned dst_layer, >>> - int src_x0, int src_y0, >>> - int dst_x0, int dst_y0, >>> - int dst_x1, int dst_y1, >>> + float src_x0, float src_y0, >>> + float src_x1, float src_y1, >>> + float dst_x0, float dst_y0, >>> + float dst_x1, float dst_y1, >>> bool mirror_x, bool mirror_y); >>> >>> bool >>> @@ -158,11 +159,11 @@ public: >>> >>> struct brw_blorp_coord_transform_params >>> { >>> - void setup(GLuint src0, GLuint dst0, GLuint dst1, >>> + void setup(GLfloat src0, GLfloat src1, GLfloat dst0, GLfloat dst1, >>> bool mirror); >>> >>> - int32_t multiplier; >>> - int32_t offset; >>> + float multiplier; >>> + float offset; >>> }; >>> >>> >>> @@ -304,6 +305,9 @@ struct brw_blorp_blit_prog_key >>> * than one sample per pixel. >>> */ >>> bool persample_msaa_dispatch; >>> + >>> + /* True for scaled blitting. */ >>> + bool blit_scaled; >>> }; >>> >>> class brw_blorp_blit_params : public brw_blorp_params >>> @@ -314,9 +318,10 @@ public: >>> unsigned src_level, unsigned src_layer, >>> struct intel_mipmap_tree *dst_mt, >>> unsigned dst_level, unsigned dst_layer, >>> - GLuint src_x0, GLuint src_y0, >>> - GLuint dst_x0, GLuint dst_y0, >>> - GLuint width, GLuint height, >>> + GLfloat src_x0, GLfloat src_y0, >>> + GLfloat src_x1, GLfloat src_y1, >>> + GLfloat dst_x0, GLfloat dst_y0, >>> + GLfloat dst_x1, GLfloat dst_y1, >>> bool mirror_x, bool mirror_y); >>> >>> virtual uint32_t get_wm_prog(struct brw_context *brw, >>> diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp >>> b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp >>> index b7ee92b..19169ef 100644 >>> --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp >>> +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp >>> @@ -41,11 +41,11 @@ >>> * If coord0 > coord1, swap them and invert the "mirror" boolean. >>> */ >>> static inline void >>> -fixup_mirroring(bool &mirror, GLint &coord0, GLint &coord1) >>> +fixup_mirroring(bool &mirror, GLfloat &coord0, GLfloat &coord1) >>> { >>> if (coord0 > coord1) { >>> mirror = !mirror; >>> - GLint tmp = coord0; >>> + GLfloat tmp = coord0; >>> coord0 = coord1; >>> coord1 = tmp; >>> } >>> @@ -67,9 +67,10 @@ fixup_mirroring(bool &mirror, GLint &coord0, GLint >>> &coord1) >>> * coordinates, by swapping the roles of src and dst. >>> */ >>> static inline bool >>> -clip_or_scissor(bool mirror, GLint &src_x0, GLint &src_x1, GLint &dst_x0, >>> - GLint &dst_x1, GLint fb_xmin, GLint fb_xmax) >>> +clip_or_scissor(bool mirror, GLfloat &src_x0, GLfloat &src_x1, GLfloat >>> &dst_x0, >>> + GLfloat &dst_x1, GLfloat fb_xmin, GLfloat fb_xmax) >>> { >>> + float scale = (float) (src_x1 - src_x0) / (dst_x1 - dst_x0); >>> /* If we are going to scissor everything away, stop. */ >>> if (!(fb_xmin < fb_xmax && >>> dst_x0 < fb_xmax && >>> @@ -105,8 +106,8 @@ clip_or_scissor(bool mirror, GLint &src_x0, GLint >>> &src_x1, GLint &dst_x0, >>> /* Adjust the source rectangle to remove the pixels corresponding to >>> those >>> * that were clipped/scissored out of the destination rectangle. >>> */ >>> - src_x0 += pixels_clipped_left; >>> - src_x1 -= pixels_clipped_right; >>> + src_x0 += pixels_clipped_left * scale; >>> + src_x1 -= pixels_clipped_right * scale; >>> >>> return true; >>> } >>> @@ -127,9 +128,10 @@ brw_blorp_blit_miptrees(struct intel_context *intel, >>> unsigned src_level, unsigned src_layer, >>> struct intel_mipmap_tree *dst_mt, >>> unsigned dst_level, unsigned dst_layer, >>> - int src_x0, int src_y0, >>> - int dst_x0, int dst_y0, >>> - int dst_x1, int dst_y1, >>> + float src_x0, float src_y0, >>> + float src_x1, float src_y1, >>> + float dst_x0, float dst_y0, >>> + float dst_x1, float dst_y1, >>> bool mirror_x, bool mirror_y) >>> { >>> intel_miptree_slice_resolve_depth(intel, src_mt, src_level, >>> src_layer); >>> @@ -139,6 +141,7 @@ brw_blorp_blit_miptrees(struct intel_context *intel, >>> src_mt, src_level, src_layer, >>> dst_mt, dst_level, dst_layer, >>> src_x0, src_y0, >>> + src_x1, src_y1, >>> dst_x0, dst_y0, >>> dst_x1, dst_y1, >>> mirror_x, mirror_y); >>> @@ -151,8 +154,8 @@ static void >>> do_blorp_blit(struct intel_context *intel, GLbitfield buffer_bit, >>> struct intel_renderbuffer *src_irb, >>> struct intel_renderbuffer *dst_irb, >>> - GLint srcX0, GLint srcY0, >>> - GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, >>> + GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1, >>> + GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1, >>> bool mirror_x, bool mirror_y) >>> { >>> /* Find source/dst miptrees */ >>> @@ -163,7 +166,8 @@ do_blorp_blit(struct intel_context *intel, GLbitfield >>> buffer_bit, >>> brw_blorp_blit_miptrees(intel, >>> src_mt, src_irb->mt_level, src_irb->mt_layer, >>> dst_mt, dst_irb->mt_level, dst_irb->mt_layer, >>> - srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, >>> + srcX0, srcY0, srcX1, srcY1, >>> + dstX0, dstY0, dstX1, dstY1, >>> mirror_x, mirror_y); >>> >>> intel_renderbuffer_set_needs_downsample(dst_irb); >>> @@ -203,8 +207,8 @@ formats_match(GLbitfield buffer_bit, struct >>> intel_renderbuffer *src_irb, >>> >>> static bool >>> try_blorp_blit(struct intel_context *intel, >>> - GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, >>> - GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, >>> + GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat >>> srcY1, >>> + GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat >>> dstY1, >>> GLenum filter, GLbitfield buffer_bit) >>> { >>> struct gl_context *ctx = &intel->ctx; >>> @@ -224,9 +228,13 @@ try_blorp_blit(struct intel_context *intel, >>> fixup_mirroring(mirror_y, srcY0, srcY1); >>> fixup_mirroring(mirror_y, dstY0, dstY1); >>> >>> - /* Make sure width and height match */ >>> - if (srcX1 - srcX0 != dstX1 - dstX0) return false; >>> - if (srcY1 - srcY0 != dstY1 - dstY0) return false; >>> + /* Linear filtering is not yet implemented in blorp. So, do not use >>> blorp >>> + * engine for single sampled scaled blits. >>> + */ >>> + if ((srcX1 - srcX0 != dstX1 - dstX0 || >>> + srcY1 - srcY0 != dstY1 - dstY0) && >>> + read_fb->Visual.samples == 0) >> >> >> It's a little strange that your comment mentions linear filtering but the >> code doesn't. How about this instead (I believe it's equivalent)? >> >> if ((srcX1 - srcX0 != dstX1 - dstX0 || >> srcY1 - srcY0 != dstY1 - dstY0) && >> filter == GL_LINEAR) >> > Yeah. This is what I intended to write initially. But making this change > causes > a piglit subtest fail. > ./bin/fbo-blit-stretch -auto > 45x79 (13, 33)-(30, 44) => 200x150 (19, 23)-(87, 67) stretch_x stretch_y > nearest > Probe at (55,23) > Expected: 0.000000 1.000000 0.000000 > Observed: 1.000000 0.000000 0.000000 > > Test seems to fail at the edge pixel. Other sub tests with stretch_x stretch_y > nearest passes. This sub test looks too strict at the edges. > No failures are observed in gles3 CTS with above change. > I have three options now: > 1. Hold the patch and investigate the failing piglit test case. > 2. Push the patch without suggested change. > 3. Relying on gles3 CTS results, push the patch with suggested change. > I'm little inclined towards 3. What do you think? > I missed a failure in es3 conform framebuffer_blit_functionality_magnifying_blit.test :( It looks like a genuine bug in my implementation. I'll investigate it before pushing the patch.
>>> >>> + return false; >>> >>> /* If the destination rectangle needs to be clipped or scissored, do >>> so. >>> */ >>> @@ -278,7 +286,8 @@ try_blorp_blit(struct intel_context *intel, >>> dst_irb = >>> intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[i]); >>> if (dst_irb) >>> do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0, >>> srcY0, >>> - dstX0, dstY0, dstX1, dstY1, mirror_x, >>> mirror_y); >>> + srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, >>> + mirror_x, mirror_y); >>> } >>> break; >>> case GL_DEPTH_BUFFER_BIT: >>> @@ -289,7 +298,8 @@ try_blorp_blit(struct intel_context *intel, >>> if (!formats_match(buffer_bit, src_irb, dst_irb)) >>> return false; >>> do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0, srcY0, >>> - dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y); >>> + srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, >>> + mirror_x, mirror_y); >>> break; >>> case GL_STENCIL_BUFFER_BIT: >>> src_irb = >>> @@ -299,7 +309,8 @@ try_blorp_blit(struct intel_context *intel, >>> if (!formats_match(buffer_bit, src_irb, dst_irb)) >>> return false; >>> do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0, srcY0, >>> - dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y); >>> + srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, >>> + mirror_x, mirror_y); >>> break; >>> default: >>> assert(false); >>> @@ -347,6 +358,7 @@ brw_blorp_copytexsubimage(struct intel_context *intel, >>> */ >>> >>> int srcY1 = srcY0 + height; >>> + int srcX1 = srcX0 + width; >>> int dstX1 = dstX0 + width; >>> int dstY1 = dstY0 + height; >>> >>> @@ -364,7 +376,8 @@ brw_blorp_copytexsubimage(struct intel_context *intel, >>> brw_blorp_blit_miptrees(intel, >>> src_mt, src_irb->mt_level, src_irb->mt_layer, >>> dst_mt, dst_image->Level, dst_image->Face, >>> - srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, >>> + srcX0, srcY0, srcX1, srcY1, >>> + dstX0, dstY0, dstX1, dstY1, >>> false, mirror_y); >>> >>> /* If we're copying to a packed depth stencil texture and the source >>> @@ -380,7 +393,8 @@ brw_blorp_copytexsubimage(struct intel_context *intel, >>> brw_blorp_blit_miptrees(intel, >>> src_irb->mt, src_irb->mt_level, >>> src_irb->mt_layer, >>> dst_mt, dst_image->Level, dst_image->Face, >>> - srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, >>> + srcX0, srcY0, srcX1, srcY1, >>> + dstX0, dstY0, dstX1, dstY1, >>> false, mirror_y); >>> } >>> >>> @@ -590,7 +604,7 @@ private: >>> void encode_msaa(unsigned num_samples, intel_msaa_layout layout); >>> void decode_msaa(unsigned num_samples, intel_msaa_layout layout); >>> void kill_if_outside_dst_rect(); >>> - void translate_dst_to_src(unsigned intel_gen); >>> + void translate_dst_to_src(); >>> void single_to_blend(); >>> void manual_blend(unsigned num_samples); >>> void sample(struct brw_reg dst); >>> @@ -772,7 +786,7 @@ brw_blorp_blit_program::compile(struct brw_context >>> *brw, >>> kill_if_outside_dst_rect(); >>> >>> /* Next, apply a translation to obtain coordinates in the source >>> image. */ >>> - translate_dst_to_src(brw->intel.gen); >>> + translate_dst_to_src(); >>> >>> /* If the source image is not multisampled, then we want to fetch >>> sample >>> * number 0, because that's the only sample there is. >>> @@ -844,7 +858,7 @@ brw_blorp_blit_program::alloc_push_const_regs(int >>> base_reg) >>> #define CONST_LOC(name) offsetof(brw_blorp_wm_push_constants, name) >>> #define ALLOC_REG(name) \ >>> this->name = \ >>> - brw_ud1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) / >>> 4) >>> + brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) / >>> 4) >>> >>> ALLOC_REG(dst_x0); >>> ALLOC_REG(dst_x1); >>> @@ -1322,29 +1336,37 @@ brw_blorp_blit_program::kill_if_outside_dst_rect() >>> * coordinates. >>> */ >>> void >>> -brw_blorp_blit_program::translate_dst_to_src(unsigned intel_gen) >>> +brw_blorp_blit_program::translate_dst_to_src() >>> { >>> + struct brw_reg X_f = retype(X, BRW_REGISTER_TYPE_F); >>> + struct brw_reg Y_f = retype(Y, BRW_REGISTER_TYPE_F); >>> + struct brw_reg Xp_f = retype(Xp, BRW_REGISTER_TYPE_F); >>> + struct brw_reg Yp_f = retype(Yp, BRW_REGISTER_TYPE_F); >>> + >>> brw_set_compression_control(&func, BRW_COMPRESSION_COMPRESSED); >>> - /* For mul instruction: >>> - * On SNB when both src0 and src1 are of type D or UD, only the low 16 >>> bits >>> - * of each element of src0 are used. >>> - * On IVB when both src0 and src1 are of type D or UD, only the low 16 >>> bits >>> - * of each element of src1 are used. >>> - * multiplier can be positive or negative. So keep the multiplier in a >>> src >>> - * register which don't get truncated during multiplication. >>> - */ >>> - if (intel_gen == 6) { >>> - brw_MUL(&func, Xp, X, x_transform.multiplier); >>> - brw_MUL(&func, Yp, Y, y_transform.multiplier); >>> + /* Move the UD coordinates to float registers. */ >>> + brw_MOV(&func, Xp_f, X); >>> + brw_MOV(&func, Yp_f, Y); >>> + /* Scale and offset */ >>> + brw_MUL(&func, X_f, Xp_f, x_transform.multiplier); >>> + brw_MUL(&func, Y_f, Yp_f, y_transform.multiplier); >>> + brw_ADD(&func, X_f, X_f, x_transform.offset); >>> + brw_ADD(&func, Y_f, Y_f, y_transform.offset); >>> + if (key->blit_scaled) { >>> + /* Round the float coordinates down to nearest integer. */ >>> + brw_RNDD(&func, Xp_f, X_f); >>> + brw_RNDD(&func, Yp_f, Y_f); >>> + /* Move the float coordinates back to UD registers. */ >>> + brw_MOV(&func, X, Xp_f); >>> + brw_MOV(&func, Y, Yp_f); >>> } >>> else { >>> - brw_MUL(&func, Xp, x_transform.multiplier, X); >>> - brw_MUL(&func, Yp, y_transform.multiplier, Y); >>> + /* Move the float coordinates back to UD registers. */ >>> + brw_MOV(&func, Xp, X_f); >>> + brw_MOV(&func, Yp, Y_f); >>> + SWAP_XY_AND_XPYP(); >>> } >> >> >> I don't believe key->blit_scaled is needed. Both branches of the if >> statement compute the same thing, since ((int) f) == ((int) RNDD(f)). Let's >> just use the code in the "else" block, since that's fewer instructions. >> > right. consider it fixed. >>> >>> - brw_ADD(&func, Xp, Xp, x_transform.offset); >>> - brw_ADD(&func, Yp, Yp, y_transform.offset); >>> brw_set_compression_control(&func, BRW_COMPRESSION_NONE); >>> - SWAP_XY_AND_XPYP(); >>> } >>> >>> /** >>> @@ -1709,29 +1731,30 @@ brw_blorp_blit_program::render_target_write() >>> >>> >>> void >>> -brw_blorp_coord_transform_params::setup(GLuint src0, GLuint dst0, GLuint >>> dst1, >>> +brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1, >>> + GLfloat dst0, GLfloat dst1, >>> bool mirror) >>> { >>> + float scale = (src1 - src0) / (dst1 - dst0); >>> if (!mirror) { >>> /* When not mirroring a coordinate (say, X), we need: >>> - * x' - src_x0 = x - dst_x0 >>> + * x' - src_x0 = x - dst_x0 + 0.5 >>> * Therefore: >>> - * x' = 1*x + (src_x0 - dst_x0) >>> + * x' = 1*x + (src_x0 - dst_x0 + 0.5) >>> */ >>> - multiplier = 1; >>> - offset = (int) (src0 - dst0); >>> + multiplier = scale; >>> + offset = (src0 - dst0 + 0.5) * scale; >>> } else { >>> /* When mirroring X we need: >>> - * x' - src_x0 = dst_x1 - x - 1 >>> + * x' - src_x0 = dst_x1 - x - 0.5 >>> * Therefore: >>> - * x' = -1*x + (src_x0 + dst_x1 - 1) >>> + * x' = -1*x + (src_x0 + dst_x1 - 0.5) >>> */ >>> - multiplier = -1; >>> - offset = src0 + dst1 - 1; >>> + multiplier = -scale; >>> + offset = (src0 + dst1 - 0.5) * scale; >>> } >>> } >> >> >> It would be nice to have a comment in the above function saying why the >> 0.5's are necessary. If I'm not mistaken, the reason they are necessary is >> that the blorp program uses "round toward zero" to convert the transformed >> floating point coordinates to integer coordinates, whereas the behaviour we >> actually want is "round to nearest", so 0.5 provides the necessary >> correction. >> > I'll put this comment in the code. >>> >>> >>> - >>> /** >>> * Determine which MSAA layout the GPU pipeline should be configured for, >>> * based on the chip generation, the number of samples, and the true >>> layout of >>> @@ -1766,9 +1789,10 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct >>> brw_context *brw, >>> unsigned src_level, unsigned >>> src_layer, >>> struct intel_mipmap_tree >>> *dst_mt, >>> unsigned dst_level, unsigned >>> dst_layer, >>> - GLuint src_x0, GLuint >>> src_y0, >>> - GLuint dst_x0, GLuint >>> dst_y0, >>> - GLuint dst_x1, GLuint >>> dst_y1, >>> + GLfloat src_x0, GLfloat >>> src_y0, >>> + GLfloat src_x1, GLfloat >>> src_y1, >>> + GLfloat dst_x0, GLfloat >>> dst_y0, >>> + GLfloat dst_x1, GLfloat >>> dst_y1, >>> bool mirror_x, bool >>> mirror_y) >>> { >>> src.set(brw, src_mt, src_level, src_layer); >>> @@ -1834,6 +1858,11 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct >>> brw_context *brw, >>> wm_prog_key.persample_msaa_dispatch = true; >>> } >>> >>> + /* Scaled blitting or not. */ >>> + wm_prog_key.blit_scaled = >>> + ((dst_x1 - dst_x0) == (src_x1 - src_x0) && >>> + (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true; >>> + >> >> >> This should also be able to go away. >> >> With that fixed, this patch is: >> >> Reviewed-by: Paul Berry <stereotype...@gmail.com> >> > Thanks for reviewing Paul. _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev