Userspace currently busy-waits for fences to complete; on my workload, this
busy-waiting consumes 10% of the available CPU time.

Provide an ioctl so that userspace can wait for an EOP interrupt that
corresponds to a previous EVENT_WRITE_EOP.

Signed-off-by: Simon Farnsworth <simon.farnswo...@onelan.co.uk>
---
I've been working on top of Jerome's tiling patches, so this doesn't apply
directly on top of current upstream kernels. I can easily rebase to another
version upon request - just point me to a git tree.

My goal is to remove the sched_yield() busy-wait loop in Mesa's
r600_fence_finish given a new enough kernel; I hope, though, that the
interface is clean enough for other users to extend in the future (e.g. to
cover the compute rings). Sketches of both sides of the scheme follow.
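
For illustration, the command-stream side looks roughly like this. This is
a sketch only, not the actual Mesa code: the PM4 macros below mirror the
kernel's r600d.h, and emit_eop_fence() and its arguments are invented for
this example. The GPU writes `value` to GPU address `va` at end-of-pipe and
raises the interrupt that the new ioctl waits on:

#include <stdint.h>

/* PM4 helpers mirroring the kernel's r600d.h (sketch only) */
#define PACKET3(op, n)  ((3u << 30) | (((op) & 0xFF) << 8) | (((n) & 0x3FFF) << 16))
#define PACKET3_EVENT_WRITE_EOP         0x47
#define EVENT_TYPE(x)                   ((x) << 0)
#define EVENT_INDEX(x)                  ((x) << 8)
#define CACHE_FLUSH_AND_INV_EVENT_TS    0x14
#define DATA_SEL(x)                     ((x) << 29)    /* 1 = write 32-bit data */
#define INT_SEL(x)                      ((x) << 24)    /* 2 = interrupt on write confirm */

/* Append an EVENT_WRITE_EOP fence to command stream `cs` at dword index
 * *cdw: the GPU writes `value` at `va` and fires the EOP interrupt. */
static void emit_eop_fence(uint32_t *cs, unsigned *cdw, uint64_t va, uint32_t value)
{
        cs[(*cdw)++] = PACKET3(PACKET3_EVENT_WRITE_EOP, 4);
        cs[(*cdw)++] = EVENT_TYPE(CACHE_FLUSH_AND_INV_EVENT_TS) | EVENT_INDEX(5);
        cs[(*cdw)++] = (uint32_t)va;                          /* ADDRESS_LO */
        cs[(*cdw)++] = ((va >> 32) & 0xff) | DATA_SEL(1) | INT_SEL(2);
        cs[(*cdw)++] = value;                                 /* DATA_LO */
        cs[(*cdw)++] = 0;                                     /* DATA_HI */
}

Without INT_SEL(2) the packet would still write the data, but the ioctl
would only ever be woken by someone else's fence interrupt.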

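The wait side would then be a replacement for the spin loop. Only struct
drm_radeon_gem_wait_user_fence and DRM_RADEON_GEM_WAIT_USER_FENCE come from
this patch; the helper and its name are invented for the example:

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include "radeon_drm.h"

/* Block until the GPU overwrites `pre_value` at `offset` within the BO,
 * or until `timeout_usec` elapses. Returns 0 on success, or -errno
 * (-ETIMEDOUT on timeout) per drmCommandWrite's convention. */
static int radeon_wait_user_fence(int fd, uint32_t handle, uint64_t offset,
                                  uint32_t pre_value, uint64_t timeout_usec)
{
        struct drm_radeon_gem_wait_user_fence args;

        memset(&args, 0, sizeof(args));
        args.handle = handle;            /* BO the EVENT_WRITE_EOP targets */
        args.ring = RADEON_CS_RING_GFX;  /* only the GFX ring for now */
        args.offset = offset;            /* DWORD-aligned byte offset */
        args.value = pre_value;          /* value before the GPU write */
        args.timeout_usec = timeout_usec;

        return drmCommandWrite(fd, DRM_RADEON_GEM_WAIT_USER_FENCE,
                               &args, sizeof(args));
}

r600_fence_finish would fall back to the existing busy-wait when the ioctl
is absent (KMS driver version < 2.15).
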
 drivers/gpu/drm/radeon/radeon.h       |    3 +
 drivers/gpu/drm/radeon/radeon_drv.c   |    3 +-
 drivers/gpu/drm/radeon/radeon_fence.c |    2 +
 drivers/gpu/drm/radeon/radeon_gem.c   |   70 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/radeon/radeon_kms.c   |    1 +
 include/drm/radeon_drm.h              |   30 ++++++++++++++
 6 files changed, 108 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 2859406..00c187b 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -217,6 +217,7 @@ struct radeon_fence_driver {
        unsigned long                   last_jiffies;
        unsigned long                   last_timeout;
        wait_queue_head_t               queue;
+       wait_queue_head_t               userspace_queue;
        struct list_head                created;
        struct list_head                emitted;
        struct list_head                signaled;
@@ -1348,6 +1349,8 @@ int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
                                struct drm_file *filp);
 int radeon_gem_get_tiling_ioctl(struct drm_device *dev, void *data,
                                struct drm_file *filp);
+int radeon_gem_wait_user_fence_ioctl(struct drm_device *dev, void *data,
+                                    struct drm_file *filp);
 
 /* VRAM scratch page for HDP bug, default vram page */
 struct r600_vram_scratch {
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 4ae2e1d..9f82fa9 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -56,9 +56,10 @@
  *   2.12.0 - RADEON_CS_KEEP_TILING_FLAGS
  *   2.13.0 - virtual memory support
  *   2.14.0 - add evergreen tiling informations
+ *   2.15.0 - gem_wait_user_fence ioctl
  */
 #define KMS_DRIVER_MAJOR       2
-#define KMS_DRIVER_MINOR       14
+#define KMS_DRIVER_MINOR       15
 #define KMS_DRIVER_PATCHLEVEL  0
 int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags);
 int radeon_driver_unload_kms(struct drm_device *dev);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 64ea3dd..d86bc28 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -356,6 +356,7 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
        if (wake) {
                wake_up_all(&rdev->fence_drv[ring].queue);
        }
+       wake_up_interruptible_all(&rdev->fence_drv[ring].userspace_queue);
 }
 
 int radeon_fence_count_emitted(struct radeon_device *rdev, int ring)
@@ -421,6 +422,7 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
        INIT_LIST_HEAD(&rdev->fence_drv[ring].emitted);
        INIT_LIST_HEAD(&rdev->fence_drv[ring].signaled);
        init_waitqueue_head(&rdev->fence_drv[ring].queue);
+       init_waitqueue_head(&rdev->fence_drv[ring].userspace_queue);
        rdev->fence_drv[ring].initialized = false;
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 7337850..602274f 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -531,3 +531,73 @@ int radeon_mode_dumb_destroy(struct drm_file *file_priv,
 {
        return drm_gem_handle_delete(file_priv, handle);
 }
+
+int radeon_gem_wait_user_fence_ioctl(struct drm_device *dev, void *data,
+                                    struct drm_file *filp)
+{
+       struct drm_radeon_gem_wait_user_fence *args = data;
+       struct radeon_device *rdev = dev->dev_private;
+       struct drm_gem_object *gobj;
+       struct radeon_bo *robj;
+       void *buffer_data;
+       uint32_t *fence_data;
+       int r = 0;
+       long timeout;
+       int ring = RADEON_RING_TYPE_GFX_INDEX;
+
+       /* If you're implementing this for other rings, you'll want to
+        * share code with radeon_cs_get_ring in radeon_cs.c */
+       if (args->ring != RADEON_CS_RING_GFX) {
+               return -EINVAL;
+       }
+
+       gobj = drm_gem_object_lookup(dev, filp, args->handle);
+       if (gobj == NULL) {
+               return -ENOENT;
+       }
+       robj = gem_to_radeon_bo(gobj);
+
+       if ((args->offset & 3) || args->offset + 4 > gobj->size) {
+               r = -EINVAL;
+               goto unreference;
+       }
+
+       r = radeon_bo_reserve(robj, true);
+       if (r) {
+               goto unreference;
+       }
+
+       r = radeon_bo_pin(robj, RADEON_GEM_DOMAIN_GTT, NULL);
+       if (r) {
+               goto unreserve;
+       }
+
+       r = radeon_bo_kmap(robj, &buffer_data);
+       if (r) {
+               goto unpin;
+       }
+
+       radeon_irq_kms_sw_irq_get(rdev, ring);
+
+       fence_data = (uint32_t *)buffer_data;
+       timeout = wait_event_interruptible_timeout(rdev->fence_drv[ring].userspace_queue,
+                                                  fence_data[args->offset >> 2] != args->value,
+                                                  usecs_to_jiffies(args->timeout_usec));
+
+       radeon_irq_kms_sw_irq_put(rdev, ring);
+
+       if (timeout == 0)
+               r = -ETIMEDOUT;
+       else if (timeout < 0)
+               r = timeout;
+
+       radeon_bo_kunmap(robj);
+unpin:
+       radeon_bo_unpin(robj);
+unreserve:
+       radeon_bo_unreserve(robj);
+unreference:
+       drm_gem_object_unreference_unlocked(gobj);
+
+       return r;
+}
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
index d335288..0e552cc 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -496,5 +496,6 @@ struct drm_ioctl_desc radeon_ioctls_kms[] = {
        DRM_IOCTL_DEF_DRV(RADEON_GEM_GET_TILING, radeon_gem_get_tiling_ioctl, DRM_AUTH|DRM_UNLOCKED),
        DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED),
        DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED),
+       DRM_IOCTL_DEF_DRV(RADEON_GEM_WAIT_USER_FENCE, radeon_gem_wait_user_fence_ioctl, DRM_AUTH|DRM_UNLOCKED),
 };
 int radeon_max_kms_ioctl = DRM_ARRAY_SIZE(radeon_ioctls_kms);
diff --git a/include/drm/radeon_drm.h b/include/drm/radeon_drm.h
index dd2e9cf..3d4ae93 100644
--- a/include/drm/radeon_drm.h
+++ b/include/drm/radeon_drm.h
@@ -510,6 +510,7 @@ typedef struct {
 #define DRM_RADEON_GEM_GET_TILING      0x29
 #define DRM_RADEON_GEM_BUSY            0x2a
 #define DRM_RADEON_GEM_VA              0x2b
+#define DRM_RADEON_GEM_WAIT_USER_FENCE  0x2c
 
 #define DRM_IOCTL_RADEON_CP_INIT    DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t)
 #define DRM_IOCTL_RADEON_CP_START   DRM_IO(  DRM_COMMAND_BASE + DRM_RADEON_CP_START)
@@ -552,6 +553,7 @@ typedef struct {
 #define DRM_IOCTL_RADEON_GEM_GET_TILING        DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_GET_TILING, struct drm_radeon_gem_get_tiling)
 #define DRM_IOCTL_RADEON_GEM_BUSY      DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
 #define DRM_IOCTL_RADEON_GEM_VA                DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va)
+#define DRM_IOCTL_RADEON_GEM_WAIT_USER_FENCE   DRM_IOW(DRM_COMMAND_BASE + DRM_RADEON_GEM_WAIT_USER_FENCE, struct drm_radeon_gem_wait_user_fence)
 
 typedef struct drm_radeon_init {
        enum {
@@ -967,4 +969,32 @@ struct drm_radeon_info {
        uint64_t                value;
 };
 
+/**
+ * struct drm_radeon_gem_wait_user_fence - DRM_RADEON_GEM_WAIT_USER_FENCE ioctl param
+ *
+ * @handle: Handle for the object that the GPU is expected to write
+ * @ring: The ring on which the fence packet was issued
+ * @offset: Offset (in bytes) within that object where the GPU is expected
+ *          to write. Must be DWORD-aligned
+ * @value: The value expected if the GPU has not yet written to this location
+ * @timeout_usec: The maximum time to wait for the GPU, in microseconds
+ *
+ * The DRM_RADEON_GEM_WAIT_USER_FENCE ioctl is meant to allow userspace to
+ * avoid busy-waiting for an EVENT_WRITE_EOP packet to complete (e.g. for
+ * fence sync objects in OpenGL). It expects the EVENT_WRITE_EOP packet to
+ * have requested an interrupt on completion.
+ *
+ * The ioctl returns immediately if the value supplied does not match the
+ * value found offset bytes into the buffer; otherwise, it sleeps for up
+ * to timeout_usec, waking whenever an EVENT_WRITE_EOP interrupt fires and
+ * the value in the buffer may therefore have changed.
+ */
+struct drm_radeon_gem_wait_user_fence {
+       uint32_t                handle;
+       uint32_t                ring;
+       uint64_t                offset;
+       uint32_t                value;
+       uint64_t                timeout_usec;
+};
+
 #endif
-- 
1.7.6.4
