On 29 January 2013 00:36, Kenneth Graunke <kenn...@whitecape.org> wrote:
> The BLT engine has many limitations. Currently, it can only blit > X-tiled buffers (since we don't have a kernel API to whack the BLT > tiling mode register), which means all depth/stencil operations get > punted to meta code, which can be very CPU-intensive. > > Even if we used the BLT engine, it can't blit between buffers with > different tiling modes, such as an X-tiled non-MSAA ARGB8888 texture > and a Y-tiled CMS ARGB8888 renderbuffer. This is a fundamental > limitation, and the only way around that is to use BLORP. > > Previously, BLORP only handled BlitFramebuffer. This patch adds an > additional frontend for doing CopyTexSubImage. It also makes it the > default. This is partly to increase testing and avoid hiding bugs, > and partly because the BLORP path can already handle more cases. With > trivial extensions, it should be able to handle everything the BLT can. > > This helps PlaneShift massively, which tries to CopyTexSubImage2D > between depth buffers whenever a player casts a spell. Since these > are Y-tiled, we hit meta and software ReadPixels paths, eating 99% CPU > while delivering ~1 FPS. This is particularly bad in an MMO setting > because people cast spells all the time. > > It also helps Xonotic in 4X MSAA mode. At default power management > settings, I measured a 6.35138% +/- 0.672548% performance boost (n=5). > (This data is from v1 of the patch.) > > No Piglit regressions on Ivybridge (v3) or Sandybridge (v2). > > v2: Create a fake intel_renderbuffer to wrap the destination texture > image and then reuse do_blorp_blit rather than reimplementing most > of it. Remove unnecessary clipping code and conditional rendering > check. > > v3: Reuse formats_match() to centralize checks; delete temporary > renderbuffers. Reorganize the code. 
> > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > Cc: Paul Berry <stereotype...@gmail.com> > Cc: Chad Versace <chad.vers...@linux.intel.com> > Reviewed-and-tested-by: Carl Worth <cwo...@cworth.org> [v2] > Should this be a candidate for the 9.1 branch? > --- > src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 73 > ++++++++++++++++++++++++++++ > src/mesa/drivers/dri/i965/brw_context.h | 8 +++ > src/mesa/drivers/dri/intel/intel_fbo.c | 30 ++++++++++++ > src/mesa/drivers/dri/intel/intel_fbo.h | 4 ++ > src/mesa/drivers/dri/intel/intel_tex_copy.c | 32 +++++++++--- > 5 files changed, 139 insertions(+), 8 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp > b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp > index bc7916a..b037156 100644 > --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp > +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp > @@ -23,6 +23,7 @@ > > #include "main/teximage.h" > #include "main/fbobject.h" > +#include "main/renderbuffer.h" > > #include "glsl/ralloc.h" > > @@ -295,6 +296,78 @@ try_blorp_blit(struct intel_context *intel, > return true; > } > > +bool > +brw_blorp_copytexsubimage(struct intel_context *intel, > + struct gl_renderbuffer *src_rb, > + struct gl_texture_image *dst_image, > + int srcX0, int srcY0, > + int dstX0, int dstY0, > + int width, int height) > +{ > + struct gl_context *ctx = &intel->ctx; > + struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb); > + struct intel_renderbuffer *dst_irb; > + > + /* BLORP is not supported before Gen6. */ > + if (intel->gen < 6) > + return false; > + > + /* Create a fake/wrapper renderbuffer to allow us to use > do_blorp_blit(). */ > + dst_irb = intel_create_fake_renderbuffer_wrapper(intel, dst_image); > + if (!dst_irb) > + return false; > + > + struct gl_renderbuffer *dst_rb = &dst_irb->Base.Base; > + > + /* We don't really have a buffer bit, but at this point it's only used > by > + * find_miptree() to decide whether to dereference the stencil miptree. 
> + * Since there are no stencil textures, we don't want to. 0 should
> work.
> + */
> + GLbitfield buffer_bit = 0;
>

We just talked about this in person and concluded that this doesn't work. It's possible to have combined depth/stencil buffers, and since they're usually represented as separate buffers in the hardware, I think that means that in the depth/stencil case we actually need to do two blits.

> +
> + if (!formats_match(buffer_bit, src_irb, dst_irb)) {
> + _mesa_delete_renderbuffer(ctx, dst_rb);
> + return false;
> + }
> +
> + /* Source clipping shouldn't be necessary, since copytexsubimage (in
> + * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
> + * takes care of it.
> + *
> + * Destination clipping shouldn't be necessary since the restrictions
> on
> + * glCopyTexSubImage prevent the user from specifying a destination
> rectangle
> + * that falls outside the bounds of the destination texture.
> + * See error_check_subtexture_dimensions().
> + */
> +
> + int srcY1 = srcY0 + height;
> + int dstX1 = dstX0 + width;
> + int dstY1 = dstY0 + height;
> +
> + /* Sync up the state of window system buffers. We need to do this
> before
> + * we go looking for the buffers.
> + */
> + intel_prepare_render(intel);
> +
> + /* Account for the fact that in the system framebuffer, the origin is
> at
> + * the lower left.
> + */ > + bool mirror_y = false; > + if (_mesa_is_winsys_fbo(ctx->ReadBuffer)) { > + GLint tmp = src_rb->Height - srcY0; > + srcY0 = src_rb->Height - srcY1; > + srcY1 = tmp; > + mirror_y = true; > + } > + > + do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, > + srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, false, > mirror_y); > + > + _mesa_delete_renderbuffer(ctx, dst_rb); > + return true; > +} > + > + > GLbitfield > brw_blorp_framebuffer(struct intel_context *intel, > GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 620f09f..324bb1d 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -1217,6 +1217,14 @@ brw_blorp_framebuffer(struct intel_context *intel, > GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, > GLbitfield mask, GLenum filter); > > +bool > +brw_blorp_copytexsubimage(struct intel_context *intel, > + struct gl_renderbuffer *src_rb, > + struct gl_texture_image *dst_image, > + int srcX0, int srcY0, > + int dstX0, int dstY0, > + int width, int height); > + > /* gen6_multisample_state.c */ > void > gen6_emit_3dstate_multisample(struct brw_context *brw, > diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c > b/src/mesa/drivers/dri/intel/intel_fbo.c > index 4810809..37ecbd1 100644 > --- a/src/mesa/drivers/dri/intel/intel_fbo.c > +++ b/src/mesa/drivers/dri/intel/intel_fbo.c > @@ -531,6 +531,36 @@ intel_renderbuffer_update_wrapper(struct > intel_context *intel, > return true; > } > > +/** > + * Create a fake intel_renderbuffer that wraps a gl_texture_image. 
> + */ > +struct intel_renderbuffer * > +intel_create_fake_renderbuffer_wrapper(struct intel_context *intel, > + struct gl_texture_image *image) > +{ > + struct gl_context *ctx = &intel->ctx; > + struct intel_renderbuffer *irb; > + struct gl_renderbuffer *rb; > + > + irb = CALLOC_STRUCT(intel_renderbuffer); > + if (!irb) { > + _mesa_error(ctx, GL_OUT_OF_MEMORY, "creating renderbuffer"); > + return NULL; > + } > + > + rb = &irb->Base.Base; > + > + _mesa_init_renderbuffer(rb, 0); > + rb->ClassID = INTEL_RB_CLASS; > + > + if (!intel_renderbuffer_update_wrapper(intel, irb, image, > image->Face)) { > + intel_delete_renderbuffer(ctx, rb); > + return NULL; > + } > + > + return irb; > +} > + > void > intel_renderbuffer_set_draw_offset(struct intel_renderbuffer *irb) > { > diff --git a/src/mesa/drivers/dri/intel/intel_fbo.h > b/src/mesa/drivers/dri/intel/intel_fbo.h > index 9c48e9c..f135dea 100644 > --- a/src/mesa/drivers/dri/intel/intel_fbo.h > +++ b/src/mesa/drivers/dri/intel/intel_fbo.h > @@ -140,6 +140,10 @@ intel_create_wrapped_renderbuffer(struct gl_context * > ctx, > int width, int height, > gl_format format); > > +struct intel_renderbuffer * > +intel_create_fake_renderbuffer_wrapper(struct intel_context *intel, > + struct gl_texture_image *image); > + > extern void > intel_fbo_init(struct intel_context *intel); > > diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c > b/src/mesa/drivers/dri/intel/intel_tex_copy.c > index c9cbcf4..5acdb42 100644 > --- a/src/mesa/drivers/dri/intel/intel_tex_copy.c > +++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c > @@ -41,6 +41,9 @@ > #include "intel_fbo.h" > #include "intel_tex.h" > #include "intel_blit.h" > +#ifndef I915 > +#include "brw_context.h" > +#endif > > #define FILE_DEBUG_FLAG DEBUG_TEXTURE > > @@ -177,15 +180,28 @@ intelCopyTexSubImage(struct gl_context *ctx, GLuint > dims, > GLint x, GLint y, > GLsizei width, GLsizei height) > { > - if (dims == 3 || !intel_copy_texsubimage(intel_context(ctx), > - 
intel_texture_image(texImage), > - xoffset, yoffset, > - intel_renderbuffer(rb), x, y, width, > height)) { > - fallback_debug("%s - fallback to swrast\n", __FUNCTION__); > - _mesa_meta_CopyTexSubImage(ctx, dims, texImage, > - xoffset, yoffset, zoffset, > - rb, x, y, width, height); > + struct intel_context *intel = intel_context(ctx); > + if (dims != 3) { > +#ifndef I915 > + /* Try BLORP first. It can handle almost everything. */ > + if (brw_blorp_copytexsubimage(intel, rb, texImage, x, y, > + xoffset, yoffset, width, height)) > + return; > +#endif > + > + /* Next, try the BLT engine. */ > + if (intel_copy_texsubimage(intel_context(ctx), > + intel_texture_image(texImage), > + xoffset, yoffset, > + intel_renderbuffer(rb), x, y, width, > height)) > + return; > } > + > + /* Finally, fall back to meta. This will likely be slow. */ > + fallback_debug("%s - fallback to swrast\n", __FUNCTION__); > + _mesa_meta_CopyTexSubImage(ctx, dims, texImage, > + xoffset, yoffset, zoffset, > + rb, x, y, width, height); > } > > > -- > 1.8.1.2 > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev