NEWS | 30 +++ configure.ac | 22 ++ src/intel_display.c | 77 ++++++++- src/intel_dri.c | 14 + src/sna/blt.c | 4 src/sna/compiler.h | 6 src/sna/gen4_render.c | 70 +++++--- src/sna/gen4_vertex.c | 354 ++++++++++++++++++++++++++++++++++++++++++- src/sna/gen5_render.c | 9 - src/sna/gen6_render.c | 9 - src/sna/gen7_render.c | 25 +-- src/sna/gen7_render.h | 2 src/sna/kgem.c | 47 +++-- src/sna/kgem.h | 2 src/sna/sna_accel.c | 54 ++++-- src/sna/sna_cpu.c | 24 ++ src/sna/sna_display.c | 78 +++++++++ src/sna/sna_dri.c | 45 +++-- src/sna/sna_io.c | 25 --- src/sna/sna_render.c | 2 src/sna/sna_video_textured.c | 5 21 files changed, 767 insertions(+), 137 deletions(-)
New commits: commit 678279eb373310f1a71a3d74e5a500b343e98830 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Sat Apr 6 15:58:50 2013 +0100 2.21.6 release diff --git a/NEWS b/NEWS index 05a20fa..4fc9a6b 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,33 @@ +Release 2.21.6 (2013-04-06) +=========================== +A surprising highlight of this release is a little refresh to the KMS +support for OpenBSD. OpenBSD now has its own KMS implementation which is +mostly compatible with the interface in Linux, with one or two tweaks +supplied by Mark Kettenis. This release continues to cleanup behaviour +for Haswell. + + * Workaround a failure by the xserver to invalidate DRI buffers + following a pixmap change for XComposite redirection. + https://bugs.freedesktop.org/show_bug.cgi?id=62614 + + * Fix computation of clip extents for stippling + https://bugs.freedesktop.org/show_bug.cgi?id=62618 + + * Support KMS on OpenBSD, by Mark Kettenis + + * Clean up sockets upon CloseScreen (making ourselves better behaved + for muxed setups). + + * Fix the tests for AVX/AVX2 support in CPUID and remember to check for + OS support as well. 
+ + * Report a monotonic UST value for undisplayed drawables rather than 0 + by Daniel Kurtz + + * Fix video playback on gen4 through a complex clip (more gen4 GPU woes) + https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1162046 + + Release 2.21.5 (2013-03-21) =========================== Haswell reintroduces a command to load the scanline window from the diff --git a/configure.ac b/configure.ac index 794b383..fa82507 100644 --- a/configure.ac +++ b/configure.ac @@ -23,7 +23,7 @@ # Initialize Autoconf AC_PREREQ([2.60]) AC_INIT([xf86-video-intel], - [2.21.5], + [2.21.6], [https://bugs.freedesktop.org/enter_bug.cgi?product=xorg], [xf86-video-intel]) AC_CONFIG_SRCDIR([Makefile.am]) commit 5332d5a7e055042233e279385bfe1388adfe15fa Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Sat Apr 6 15:42:23 2013 +0100 configure: Allow valgrind support to be manually enabled Irrespective of the DDX debug settings, some people wish to run Xorg under valgrind and so prefer to have the cleaner output by making the DDX valgrind aware. (Actually Maarten wants valgrind support enabled by default...) 
Suggested-by: Maarten Lankhorst <maarten.lankho...@canonical.com> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/configure.ac b/configure.ac index 8b12d01..794b383 100644 --- a/configure.ac +++ b/configure.ac @@ -353,6 +353,12 @@ AC_ARG_ENABLE(debug, [Enables internal debugging [default=no]]), [DEBUG="$enableval"], [DEBUG=no]) +AC_ARG_ENABLE(valgrind, + AS_HELP_STRING([--enable-valgrind], + [Enables valgrindified ioctls for debugging [default=no]]), + [VG="$enableval"], + [VG=no]) + # Store the list of server defined optional extensions in REQUIRED_MODULES XORG_DRIVER_CHECK_EXT(RANDR, randrproto) XORG_DRIVER_CHECK_EXT(RENDER, renderproto) @@ -439,12 +445,22 @@ AM_CONDITIONAL(DEBUG, test x$DEBUG != xno) AM_CONDITIONAL(FULL_DEBUG, test x$DEBUG = xfull) if test "x$DEBUG" = xno; then AC_DEFINE(NDEBUG,1,[Disable internal debugging]) +else + if test "x$VG" != xyes; then + VG=auto + fi fi -if test "x$DEBUG" != xno; then +if test "x$VG" != xno; then PKG_CHECK_MODULES(VALGRIND, [valgrind], have_valgrind=yes, have_valgrind=no) + AC_MSG_CHECKING([whether to include valgrind support]) if test x$have_valgrind = xyes; then AC_DEFINE([HAVE_VALGRIND], 1, [Use valgrind intrinsics to suppress false warnings]) + else + if test "x$VG" = xyes; then + AC_MSG_ERROR([valgrind support requested, but valgrind-dev headers not found]) + fi fi + AC_MSG_RESULT([$have_valgrind ($VG)]) fi if test "x$DEBUG" = xsync; then AC_DEFINE(DEBUG_SYNC,1,[Enable synchronous rendering for debugging]) commit 091cf6f0477824c5826547c02394752b6dc944ce Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Sat Apr 6 09:30:57 2013 +0100 sna: Improve assertions to detect rogue priv->cpu status Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c index 9bc6fe3..217a4a2 100644 --- a/src/sna/sna_accel.c +++ b/src/sna/sna_accel.c @@ -340,6 +340,10 @@ static void assert_pixmap_damage(PixmapPtr p) return; } + if (DAMAGE_IS_ALL(priv->gpu_damage)) 
{ + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); + } + assert(!DAMAGE_IS_ALL(priv->gpu_damage) || priv->cpu_damage == NULL); assert(!DAMAGE_IS_ALL(priv->cpu_damage) || priv->gpu_damage == NULL); @@ -1788,6 +1792,7 @@ mark_damage: } done: + assert(priv->gpu_damage == NULL); if (flags & MOVE_WRITE) { assert(DAMAGE_IS_ALL(priv->cpu_damage)); priv->source_count = SOURCE_BIAS; @@ -1898,6 +1903,7 @@ static inline bool region_inplace(struct sna *sna, if (DAMAGE_IS_ALL(priv->gpu_damage)) { DBG(("%s: yes, already wholly damaged on the GPU\n", __FUNCTION__)); assert(priv->gpu_bo); + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); return true; } @@ -1915,6 +1921,11 @@ static inline bool region_inplace(struct sna *sna, >= sna->kgem.half_cpu_cache_pages; } +static inline bool box_empty(const BoxRec *box) +{ + return box->x2 <= box->x1 || box->y2 <= box->y1; +} + bool sna_drawable_move_region_to_cpu(DrawablePtr drawable, RegionPtr region, @@ -1937,6 +1948,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable, assert_drawable_contains_box(drawable, &region->extents); } + if (box_empty(&region->extents)) + return true; + priv = sna_pixmap(pixmap); if (priv == NULL) { DBG(("%s: not attached to %p\n", __FUNCTION__, pixmap)); @@ -2012,6 +2026,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable, pixmap->drawable.height)) { DBG(("%s: replaced entire pixmap, destroying CPU shadow\n", __FUNCTION__)); + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); sna_damage_destroy(&priv->cpu_damage); list_del(&priv->list); } else @@ -2348,11 +2363,6 @@ out: return true; } -static inline bool box_empty(const BoxRec *box) -{ - return box->x2 <= box->x1 || box->y2 <= box->y1; -} - bool sna_drawable_move_to_cpu(DrawablePtr drawable, unsigned flags) { @@ -2474,6 +2484,8 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl pixmap->drawable.width, pixmap->drawable.height)) { 
assert(priv->gpu_bo); + assert(priv->gpu_bo->proxy == NULL); + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); sna_damage_destroy(&priv->cpu_damage); list_del(&priv->list); goto done; @@ -2715,6 +2727,8 @@ sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box, DBG(("%s: use GPU fast path (all-damaged)\n", __FUNCTION__)); assert(priv->cpu_damage == NULL); assert(priv->gpu_bo); + assert(priv->gpu_bo->proxy == NULL); + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); goto use_gpu_bo; } @@ -2879,6 +2893,7 @@ done: if (sna_damage_is_all(&priv->gpu_damage, pixmap->drawable.width, pixmap->drawable.height)) { + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); sna_damage_destroy(&priv->cpu_damage); list_del(&priv->list); *damage = NULL; @@ -3121,9 +3136,10 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags) pixmap->drawable.height)) { DBG(("%s: already all-damaged\n", __FUNCTION__)); assert(priv->gpu_bo); + assert(priv->gpu_bo->proxy == NULL); + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); sna_damage_destroy(&priv->cpu_damage); list_del(&priv->list); - assert(priv->cpu == false || IS_CPU_MAP(priv->gpu_bo->map)); goto active; } @@ -3279,8 +3295,10 @@ done: sna_damage_reduce_all(&priv->gpu_damage, pixmap->drawable.width, pixmap->drawable.height); - if (DAMAGE_IS_ALL(priv->gpu_damage)) + if (DAMAGE_IS_ALL(priv->gpu_damage)) { + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); sna_pixmap_free_cpu(sna, priv); + } active: if (flags & MOVE_WRITE) @@ -3945,6 +3963,7 @@ move_to_gpu(PixmapPtr pixmap, struct sna_pixmap *priv, if (DAMAGE_IS_ALL(priv->gpu_damage)) { assert(priv->gpu_bo); + assert(priv->cpu == false || (priv->mapped && IS_CPU_MAP(priv->gpu_bo->map))); return true; } commit 4a43aa81e27e8a651fde8a4761fd14bd8824d90c Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Thu Apr 4 10:53:55 2013 
+0100 sna: Restore bo->flush status for large bo Since we started discarding the flush flags on cached bo (in order to prevent DRI flush states leaking), we failed to preserve the flush flag for large bo (which uses it to keep batches trim and other hints). Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/kgem.c b/src/sna/kgem.c index c670dbb..4136ce9 100644 --- a/src/sna/kgem.c +++ b/src/sna/kgem.c @@ -3606,7 +3606,6 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem, assert(!bo->scanout); assert(bo->refcnt == 0); assert(bo->reusable); - assert(bo->flush == true); if (kgem->gen < 040) { if (bo->pitch < pitch) { @@ -3640,6 +3639,7 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem, bo->pitch, bo->tiling, bo->handle, bo->unique_id)); assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); bo->refcnt = 1; + bo->flush = true; return bo; } commit ed3dab44a717a1a88470228b5e33f20de1e4ad0d Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Tue Apr 2 15:20:52 2013 +0100 sna: Adjust userptr structure for implicit padding Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/kgem.c b/src/sna/kgem.c index 9013e68..c670dbb 100644 --- a/src/sna/kgem.c +++ b/src/sna/kgem.c @@ -128,7 +128,7 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); #define LOCAL_IOCTL_I915_GEM_USERPTR DRM_IOWR (DRM_COMMAND_BASE + LOCAL_I915_GEM_USERPTR, struct local_i915_gem_userptr) struct local_i915_gem_userptr { uint64_t user_ptr; - uint32_t user_size; + uint64_t user_size; uint32_t flags; #define I915_USERPTR_READ_ONLY (1<<0) #define I915_USERPTR_UNSYNCHRONIZED (1<<31) @@ -1482,6 +1482,7 @@ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo) if (IS_USER_MAP(bo->map)) { assert(bo->rq == NULL); + assert(!__kgem_busy(kgem, bo->handle)); assert(MAP(bo->map) != bo || bo->io || bo->flush); if (!(bo->io || bo->flush)) { DBG(("%s: freeing snooped base\n", __FUNCTION__)); commit 
4e2fc5aee035c3059ca33dbcafc71dc5988d6b09 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Tue Apr 2 14:29:29 2013 +0100 sna: Relax scanline waits on HSW to be emittable from either ring My overzealous reading of the bspec led me to the conclusion that the MI_LOAD_SCANLINES command was only available on the blitter ring. This is false, thankfully, and allows us to do vsync'ed Xv. Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c index 6d61650..0d32086 100644 --- a/src/sna/sna_display.c +++ b/src/sna/sna_display.c @@ -2887,15 +2887,13 @@ static bool sna_emit_wait_for_scanline_hsw(struct sna *sna, uint32_t event; uint32_t *b; - if (sna->kgem.mode != KGEM_BLT) - return false; - b = kgem_get_batch(&sna->kgem); sna->kgem.nbatch += 5; /* The documentation says that the LOAD_SCAN_LINES command * always comes in pairs. Don't ask me why. */ switch (pipe) { + default: assert(0); case 0: event = 0; break; case 1: event = 1 << 19; break; case 2: event = 4 << 19; break; @@ -2904,6 +2902,7 @@ static bool sna_emit_wait_for_scanline_hsw(struct sna *sna, b[3] = b[1] = (y1 << 16) | (y2-1); switch (pipe) { + default: assert(0); case 0: event = 0; break; case 1: event = 1 << 8; break; case 2: event = 1 << 14; break; commit 5a36fdcee769195d5c6e642e84a8976114e7c6de Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Tue Apr 2 11:01:53 2013 +0100 sna/gen4: Kill stray debugging ErrorF from previous commit Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c index c05b37b..69a5c77 100644 --- a/src/sna/gen4_render.c +++ b/src/sna/gen4_render.c @@ -1392,7 +1392,6 @@ gen4_render_video(struct sna *sna, n = gen4_get_rectangles(sna, &tmp, min(nbox, 16), gen4_video_bind_surfaces); - ErrorF("n=%d/%d\n", n, nbox); assert(n); nbox -= n; commit 3d7e16addb2fb5f35936aafe8e16685a91d30f59 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Tue Apr 2 10:58:52 2013 
sna/gen4: Break the Video rendering loop into 16 rectangle chunks If we feed more than 16 rectangles into the video rendering pipeline, the GPU goes crazy and starts emitting corruption. Lalalala. Bugzilla: https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1162046 Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c index 1bf5ad2..c05b37b 100644 --- a/src/sna/gen4_render.c +++ b/src/sna/gen4_render.c @@ -1387,37 +1387,51 @@ gen4_render_video(struct sna *sna, box = REGION_RECTS(dstRegion); nbox = REGION_NUM_RECTS(dstRegion); - while (nbox--) { - BoxRec r; + do { + int n; - r.x1 = box->x1 + pix_xoff; - r.x2 = box->x2 + pix_xoff; - r.y1 = box->y1 + pix_yoff; - r.y2 = box->y2 + pix_yoff; + n = gen4_get_rectangles(sna, &tmp, min(nbox, 16), + gen4_video_bind_surfaces); + ErrorF("n=%d/%d\n", n, nbox); + assert(n); + nbox -= n; - gen4_get_rectangles(sna, &tmp, 1, gen4_video_bind_surfaces); + do { + BoxRec r; - OUT_VERTEX(r.x2, r.y2); - OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x); - OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y); + r.x1 = box->x1 + pix_xoff; + r.x2 = box->x2 + pix_xoff; + r.y1 = box->y1 + pix_yoff; + r.y2 = box->y2 + pix_yoff; - OUT_VERTEX(r.x1, r.y2); - OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x); - OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y); + OUT_VERTEX(r.x2, r.y2); + OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x); + OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y); - OUT_VERTEX(r.x1, r.y1); - OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x); - OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y); + OUT_VERTEX(r.x1, r.y2); + OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x); + OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y); - if (!DAMAGE_IS_ALL(priv->gpu_damage)) { - sna_damage_add_box(&priv->gpu_damage, &r); - sna_damage_subtract_box(&priv->cpu_damage, &r); - } - box++; - } - priv->clear = false; + OUT_VERTEX(r.x1, r.y1); + 
OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x); + OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y); - gen4_vertex_flush(sna); + if (!DAMAGE_IS_ALL(priv->gpu_damage)) { + sna_damage_add_box(&priv->gpu_damage, &r); + sna_damage_subtract_box(&priv->cpu_damage, &r); + } + box++; + } while (--n); + + gen4_vertex_flush(sna); + if (!nbox) + break; + + /* VUE corruption strikes again */ + OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH); + } while (1); + + priv->clear = false; return true; } diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c index bd20325..d94dbd8 100644 --- a/src/sna/sna_video_textured.c +++ b/src/sna/sna_video_textured.c @@ -230,6 +230,11 @@ sna_video_textured_put_image(ScrnInfoPtr scrn, drw_x, drw_y, drw_w, drw_h, id, width, height, sync)); + DBG(("%s: region %d:(%d, %d), (%d, %d)\n", __FUNCTION__, + RegionNumRects(clip), + clip->extents.x1, clip->extents.y1, + clip->extents.x2, clip->extents.y2)); + if (buf == 0) { DBG(("%s: garbage video buffer\n", __FUNCTION__)); return BadAlloc; commit f09aa788d79d36688bcfdd3b49b92367590c5f16 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Tue Apr 2 10:01:21 2013 +0100 DRI2GetMSC: Do not send a bogus ust for when the drawable is not displayed According to the opengl glx_sync_control spec, the Unadjusted System Time (or UST) is a 64-bit monotonically increasing counter that is available throughout the system: http://www.opengl.org/registry/specs/OML/glx_sync_control.txt Therefore, sending 0, even in this corner case, is out of spec. However, we cannot just return FALSE here as that triggers a BadDrawable error to be sent, and as is often the case mishandled, to the client. This results in a certain compositor terminating, for example. As an alternative we can use the monotonic system timestamp which in theory should also be monotonic with the previous and subsequent vblank times. Based on a patch by Daniel Kurtz. 
Reported-by: Daniel Kurtz <djku...@chromium.org> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/intel_dri.c b/src/intel_dri.c index f351203..8f27921 100644 --- a/src/intel_dri.c +++ b/src/intel_dri.c @@ -1326,6 +1326,16 @@ blit_fallback: return TRUE; } +static uint64_t gettime_us(void) +{ + struct timespec tv; + + if (clock_gettime(CLOCK_MONOTONIC, &tv)) + return 0; + + return (uint64_t)tv.tv_sec * 1000000 + tv.tv_nsec / 1000; +} + /* * Get current frame count and frame count timestamp, based on drawable's * crtc. @@ -1339,9 +1349,9 @@ I830DRI2GetMSC(DrawablePtr draw, CARD64 *ust, CARD64 *msc) drmVBlank vbl; int ret, pipe = I830DRI2DrawablePipe(draw); - /* Drawable not displayed, make up a value */ + /* Drawable not displayed, make up a *monotonic* value */ if (pipe == -1) { - *ust = 0; + *ust = gettime_us(); *msc = 0; return TRUE; } diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c index 0962e25..5fb1662 100644 --- a/src/sna/sna_dri.c +++ b/src/sna/sna_dri.c @@ -37,6 +37,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #endif #include <errno.h> +#include <time.h> #include <string.h> #include "sna.h" @@ -2216,6 +2217,16 @@ sna_dri_async_swap(ClientPtr client, DrawablePtr draw, } #endif +static uint64_t gettime_us(void) +{ + struct timespec tv; + + if (clock_gettime(CLOCK_MONOTONIC, &tv)) + return 0; + + return (uint64_t)tv.tv_sec * 1000000 + tv.tv_nsec / 1000; +} + /* * Get current frame count and frame count timestamp, based on drawable's * crtc. 
@@ -2227,13 +2238,16 @@ sna_dri_get_msc(DrawablePtr draw, CARD64 *ust, CARD64 *msc) drmVBlank vbl; int pipe; - /* Drawable not displayed, make up a value */ - *ust = *msc = 0; pipe = sna_dri_get_pipe(draw); DBG(("%s(pipe=%d)\n", __FUNCTION__, pipe)); - if (pipe == -1) + if (pipe == -1) { +fail: + /* Drawable not displayed, make up a *monotonic* value */ + *ust = gettime_us(); + *msc = 0; return TRUE; + } VG_CLEAR(vbl); vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe); @@ -2246,6 +2260,7 @@ sna_dri_get_msc(DrawablePtr draw, CARD64 *ust, CARD64 *msc) } else { DBG(("%s: query failed on pipe %d, ret=%d\n", __FUNCTION__, pipe, errno)); + goto fail; } return TRUE; commit 4af622edfc18af523e1fa9063379f68374e19b04 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Mon Apr 1 22:44:13 2013 +0100 sna: Try to eliminate pending operations to the bo being replaced When we are replacing a bo with fresh data, we can drop pending operations to it and thereby reduce the complexity of the replacement. 
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/kgem.c b/src/sna/kgem.c index 231dc8e..9013e68 100644 --- a/src/sna/kgem.c +++ b/src/sna/kgem.c @@ -1732,6 +1732,23 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags) return NULL; } +void kgem_bo_undo(struct kgem *kgem, struct kgem_bo *bo) +{ + if (kgem->nexec != 1 || bo->exec == NULL) + return; + + DBG(("%s: only handle in batch, discarding last operations\n", + __FUNCTION__)); + + assert(bo->exec == &kgem->exec[0]); + assert(kgem->exec[0].handle == bo->handle); + assert(RQ(bo->rq) == kgem->next_request); + + bo->refcnt++; + kgem_reset(kgem); + bo->refcnt--; +} + static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) { DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); @@ -1782,16 +1799,8 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) assert(bo->io == false); assert(bo->scanout == false); - if (bo->exec && kgem->nexec == 1) { - DBG(("%s: only handle in batch, discarding last operations\n", - __FUNCTION__)); - assert(bo->exec == &kgem->exec[0]); - assert(kgem->exec[0].handle == bo->handle); - assert(RQ(bo->rq) == kgem->next_request); - bo->refcnt = 1; - kgem_reset(kgem); - bo->refcnt = 0; - } + kgem_bo_undo(kgem, bo); + assert(bo->refcnt == 0); if (bo->rq && bo->exec == NULL && !__kgem_busy(kgem, bo->handle)) __kgem_bo_clear_busy(bo); diff --git a/src/sna/kgem.h b/src/sna/kgem.h index 82f9b52..f2b1c98 100644 --- a/src/sna/kgem.h +++ b/src/sna/kgem.h @@ -573,6 +573,8 @@ static inline bool kgem_bo_is_snoop(struct kgem_bo *bo) return bo->snoop; } +void kgem_bo_undo(struct kgem *kgem, struct kgem_bo *bo); + bool __kgem_busy(struct kgem *kgem, int handle); static inline void kgem_bo_mark_busy(struct kgem_bo *bo, int ring) diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c index 540f3a6..14c0d8c 100644 --- a/src/sna/sna_io.c +++ b/src/sna/sna_io.c @@ -1362,38 +1362,25 @@ bool sna_replace(struct sna *sna, { struct kgem_bo *bo = 
*_bo; struct kgem *kgem = &sna->kgem; - bool busy; void *dst; - busy = __kgem_bo_is_busy(kgem, bo); DBG(("%s(handle=%d, %dx%d, bpp=%d, tiling=%d) busy?=%d\n", __FUNCTION__, bo->handle, pixmap->drawable.width, pixmap->drawable.height, pixmap->drawable.bitsPerPixel, - bo->tiling, busy)); + bo->tiling, + __kgem_bo_is_busy(kgem, bo))); assert(!sna_pixmap(pixmap)->pinned); - if (!busy && upload_inplace__tiled(kgem, bo)) { - BoxRec box; + kgem_bo_undo(kgem, bo); - box.x1 = box.y1 = 0; - box.x2 = pixmap->drawable.width; - box.y2 = pixmap->drawable.height; + if (__kgem_bo_is_busy(kgem, bo)) { + struct kgem_bo *new_bo; - if (write_boxes_inplace__tiled(kgem, src, - stride, pixmap->drawable.bitsPerPixel, 0, 0, - bo, 0, 0, &box, 1)) + if (indirect_replace(sna, pixmap, bo, src, stride)) return true; - } - - if ((busy || !kgem_bo_can_map(kgem, bo)) && - indirect_replace(sna, pixmap, bo, src, stride)) - return true; - - if (busy) { - struct kgem_bo *new_bo; new_bo = kgem_create_2d(kgem, pixmap->drawable.width, commit ef0038d358e613381e03c077e06a87fc49108d87 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Mon Apr 1 22:43:48 2013 +0100 sna: Allow the compiler to inline memcpy for the bitblt routines Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/blt.c b/src/sna/blt.c index 4735d14..af87667 100644 --- a/src/sna/blt.c +++ b/src/sna/blt.c @@ -138,7 +138,7 @@ xmm_save_128(__m128i *dst, __m128i data) } #endif -void +fast_memcpy void memcpy_blt(const void *src, void *dst, int bpp, int32_t src_stride, int32_t dst_stride, int16_t src_x, int16_t src_y, @@ -213,7 +213,7 @@ memcpy_blt(const void *src, void *dst, int bpp, } } -void +fast_memcpy void memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling, int32_t src_stride, int32_t dst_stride, int16_t src_x, int16_t src_y, diff --git a/src/sna/compiler.h b/src/sna/compiler.h index b5c9ac2..62f51f0 100644 --- a/src/sna/compiler.h +++ b/src/sna/compiler.h @@ -63,6 +63,12 @@ #define avx2 
__attribute__((target("avx2,sse4.2,sse2,fpmath=sse"))) #endif +#if HAS_GCC(4, 5) && defined(__OPTIMIZE__) +#define fast_memcpy __attribute__((target("inline-all-stringops"))) +#else +#define fast_memcpy +#endif + #ifdef HAVE_VALGRIND #define VG(x) x #else commit 43181692f752f0a552d2e2c76d8379fe16e521cf Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Thu Mar 28 15:41:38 2013 +0000 sna/gen7: Refine is_gt2() for Haswell versus Ivybridge The two similar chipsets do not use the same PCI-ID encoding schema. Fixes regression from commit 235a3981ea9759317b392302a2b2b8f4fafab410 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Tue Mar 26 20:37:14 2013 +0000 sna/gen7: Use GT2 values for GT2 variants Reported-by: zave...@free.fr Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c index e3f80d4..74b0cba 100644 --- a/src/sna/gen7_render.c +++ b/src/sna/gen7_render.c @@ -3687,7 +3687,7 @@ static void gen7_render_fini(struct sna *sna) static bool is_gt2(struct sna *sna) { - return DEVICE_ID(sna->PciInfo) & 0x30; + return DEVICE_ID(sna->PciInfo) & (sna->kgem.gen == 075 ? 
0x30 : 0x20); } static bool is_mobile(struct sna *sna) commit 96c10bdff95a3f8a68c6623446655c4c3dbf738a Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Wed Mar 27 22:10:37 2013 +0000 sna/gen7: Resist the temptation to overprogram the number of PS threads for HSW Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c index 773d2f3..e3f80d4 100644 --- a/src/sna/gen7_render.c +++ b/src/sna/gen7_render.c @@ -125,10 +125,10 @@ static const struct gt_info hsw_gt1_info = { }; static const struct gt_info hsw_gt2_info = { - .max_vs_threads = 280, - .max_gs_threads = 280, + .max_vs_threads = 140, + .max_gs_threads = 140, .max_wm_threads = - (204 - 1) << HSW_PS_MAX_THREADS_SHIFT | + (140 - 1) << HSW_PS_MAX_THREADS_SHIFT | 1 << HSW_PS_SAMPLE_MASK_SHIFT, .urb = { 256, 1664, 640 }, }; commit 19dfa72c28c6dc677dbfec3a538d4481985195e5 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Wed Mar 27 16:56:10 2013 +0000 sna/gen4+: Set read-write allocation mode for the target render cache As we often first clear the destination before performing a blend, we get a performance boost if that first write populates the render cache. 
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c index e40a1b7..1bf5ad2 100644 --- a/src/sna/gen4_render.c +++ b/src/sna/gen4_render.c @@ -502,7 +502,7 @@ gen4_bind_bo(struct sna *sna, assert(sna->kgem.gen != 040 || !kgem_bo_is_snoop(bo)); /* After the first bind, we manage the cache domains within the batch */ - offset = kgem_bo_get_binding(bo, format); + offset = kgem_bo_get_binding(bo, format | is_dst << 31); if (offset) { if (is_dst) kgem_bo_mark_dirty(bo); @@ -517,9 +517,10 @@ gen4_bind_bo(struct sna *sna, GEN4_SURFACE_BLEND_ENABLED | format << GEN4_SURFACE_FORMAT_SHIFT); - if (is_dst) + if (is_dst) { + ss[0] |= GEN4_SURFACE_RC_READ_WRITE; domains = I915_GEM_DOMAIN_RENDER << 16 | I915_GEM_DOMAIN_RENDER; - else + } else domains = I915_GEM_DOMAIN_SAMPLER << 16; ss[1] = kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0); @@ -530,7 +531,7 @@ gen4_bind_bo(struct sna *sna, ss[4] = 0; ss[5] = 0; - kgem_bo_set_binding(bo, format, offset); + kgem_bo_set_binding(bo, format | is_dst << 31, offset); DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n", offset, bo->handle, ss[1], diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c index 8b50d22..7038444 100644 --- a/src/sna/gen5_render.c +++ b/src/sna/gen5_render.c @@ -490,7 +490,7 @@ gen5_bind_bo(struct sna *sna, /* After the first bind, we manage the cache domains within the batch */ if (!DBG_NO_SURFACE_CACHE) { - offset = kgem_bo_get_binding(bo, format); + offset = kgem_bo_get_binding(bo, format | is_dst << 31); if (offset) { if (is_dst) kgem_bo_mark_dirty(bo); @@ -506,9 +506,10 @@ gen5_bind_bo(struct sna *sna, GEN5_SURFACE_BLEND_ENABLED | format << GEN5_SURFACE_FORMAT_SHIFT); - if (is_dst) + if (is_dst) { + ss[0] |= GEN5_SURFACE_RC_READ_WRITE; domains = I915_GEM_DOMAIN_RENDER << 16 | I915_GEM_DOMAIN_RENDER; - else + } else domains = I915_GEM_DOMAIN_SAMPLER << 16; ss[1] = kgem_add_reloc(&sna->kgem, 
offset + 1, bo, domains, 0); @@ -519,7 +520,7 @@ gen5_bind_bo(struct sna *sna, ss[4] = 0; ss[5] = 0; - kgem_bo_set_binding(bo, format, offset); + kgem_bo_set_binding(bo, format | is_dst << 31, offset); DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n", offset, bo->handle, ss[1], diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c index 64eccc5..8101faf 100644 --- a/src/sna/gen6_render.c +++ b/src/sna/gen6_render.c @@ -1035,7 +1035,7 @@ gen6_bind_bo(struct sna *sna, uint32_t is_scanout = is_dst && bo->scanout; /* After the first bind, we manage the cache domains within the batch */ - offset = kgem_bo_get_binding(bo, format | is_scanout << 31); + offset = kgem_bo_get_binding(bo, format | is_dst << 30 | is_scanout << 31); if (offset) { DBG(("[%x] bo(handle=%d), format=%d, reuse %s binding\n", offset, bo->handle, format, @@ -1051,9 +1051,10 @@ gen6_bind_bo(struct sna *sna, ss[0] = (GEN6_SURFACE_2D << GEN6_SURFACE_TYPE_SHIFT | GEN6_SURFACE_BLEND_ENABLED | format << GEN6_SURFACE_FORMAT_SHIFT); - if (is_dst) + if (is_dst) { + ss[0] |= GEN6_SURFACE_RC_READ_WRITE; domains = I915_GEM_DOMAIN_RENDER << 16 |I915_GEM_DOMAIN_RENDER; - else + } else domains = I915_GEM_DOMAIN_SAMPLER << 16; ss[1] = kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0); ss[2] = ((width - 1) << GEN6_SURFACE_WIDTH_SHIFT | @@ -1064,7 +1065,7 @@ gen6_bind_bo(struct sna *sna, ss[4] = 0; ss[5] = is_scanout ? 
0 : 3 << 16; - kgem_bo_set_binding(bo, format | is_scanout << 31, offset); + kgem_bo_set_binding(bo, format | is_dst << 30 | is_scanout << 31, offset); DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n", offset, bo->handle, ss[1], commit d9b8c2039d1be17af8c56364341fc3e10795f200 Author: Chris Wilson <ch...@chris-wilson.co.uk> Date: Wed Mar 27 14:49:15 2013 +0000 sna/gen7: Fix MOCS for Haswell The memory attributes changed slightly, and in particular there is now an explicit uncached setting - which of course happened to be the value currently selected. Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c index 9e40860..773d2f3 100644 --- a/src/sna/gen7_render.c +++ b/src/sna/gen7_render.c @@ -1190,7 +1190,7 @@ gen7_bind_bo(struct sna *sna, COMPILE_TIME_ASSERT(sizeof(struct gen7_surface_state) == 32); /* After the first bind, we manage the cache domains within the batch */ - offset = kgem_bo_get_binding(bo, format | is_scanout << 31); + offset = kgem_bo_get_binding(bo, format | is_dst << 30 | is_scanout << 31); if (offset) { if (is_dst) kgem_bo_mark_dirty(bo); -- To UNSUBSCRIBE, email to debian-x-requ...@lists.debian.org with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org Archive: http://lists.debian.org/e1upd3u-0003x3...@vasks.debian.org