From: Nicolai Hähnle <nicolai.haeh...@amd.com> Only enable slab sub-allocation for chips with GPUVM, because the older non-GPUVM driver paths do not take the required address offset within the backing buffer into account. --- src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 170 ++++++++++++++++++++++ src/gallium/winsys/radeon/drm/radeon_drm_bo.h | 12 ++ src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 24 ++- src/gallium/winsys/radeon/drm/radeon_drm_winsys.h | 5 + 4 files changed, 209 insertions(+), 2 deletions(-)
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index 4b7dbdc..2e7635e 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -35,20 +35,27 @@ #include "state_tracker/drm_driver.h" #include <sys/ioctl.h> #include <xf86drm.h> #include <errno.h> #include <fcntl.h> #include <stdio.h> #include <inttypes.h> +static struct pb_buffer * +radeon_winsys_bo_create(struct radeon_winsys *rws, + uint64_t size, + unsigned alignment, + enum radeon_bo_domain domain, + enum radeon_bo_flag flags); + static inline struct radeon_bo *radeon_bo(struct pb_buffer *bo) { return (struct radeon_bo *)bo; } struct radeon_bo_va_hole { struct list_head list; uint64_t offset; uint64_t size; }; @@ -693,20 +700,134 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws, bool radeon_bo_can_reclaim(struct pb_buffer *_buf) { struct radeon_bo *bo = radeon_bo(_buf); if (radeon_bo_is_referenced_by_any_cs(bo)) return false; return radeon_bo_wait(_buf, 0, RADEON_USAGE_READWRITE); } +bool radeon_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry) +{ + struct radeon_bo *bo = NULL; /* fix container_of */ + bo = container_of(entry, bo, u.slab.entry); + + return radeon_bo_can_reclaim(&bo->base); +} + +static void radeon_bo_slab_destroy(struct pb_buffer *_buf) +{ + struct radeon_bo *bo = radeon_bo(_buf); + + assert(!bo->handle); + + pb_slab_free(&bo->rws->bo_slabs, &bo->u.slab.entry); +} + +static const struct pb_vtbl radeon_winsys_bo_slab_vtbl = { + radeon_bo_slab_destroy + /* other functions are never called */ +}; + +struct pb_slab *radeon_bo_slab_alloc(void *priv, unsigned heap, + unsigned entry_size, + unsigned group_index) +{ + struct radeon_drm_winsys *ws = priv; + struct radeon_slab *slab = CALLOC_STRUCT(radeon_slab); + enum radeon_bo_domain domains; + enum radeon_bo_flag flags = 0; + unsigned base_hash; + + if (!slab) + return NULL; + + if (heap & 1) + flags 
|= RADEON_FLAG_GTT_WC; + if (heap & 2) + flags |= RADEON_FLAG_CPU_ACCESS; + + switch (heap >> 2) { + case 0: + domains = RADEON_DOMAIN_VRAM; + break; + default: + case 1: + domains = RADEON_DOMAIN_VRAM_GTT; + break; + case 2: + domains = RADEON_DOMAIN_GTT; + break; + } + + slab->buffer = radeon_bo(radeon_winsys_bo_create(&ws->base, + 64 * 1024, 64 * 1024, + domains, flags)); + if (!slab->buffer) + goto fail; + + assert(slab->buffer->handle); + + slab->base.num_entries = slab->buffer->base.size / entry_size; + slab->base.num_free = slab->base.num_entries; + slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries)); + if (!slab->entries) + goto fail_buffer; + + LIST_INITHEAD(&slab->base.free); + + base_hash = __sync_fetch_and_add(&ws->next_bo_hash, slab->base.num_entries); + + for (unsigned i = 0; i < slab->base.num_entries; ++i) { + struct radeon_bo *bo = &slab->entries[i]; + + bo->base.alignment = entry_size; + bo->base.usage = slab->buffer->base.usage; + bo->base.size = entry_size; + bo->base.vtbl = &radeon_winsys_bo_slab_vtbl; + bo->rws = ws; + bo->va = slab->buffer->va + i * entry_size; + bo->initial_domain = domains; + bo->hash = base_hash + i; + bo->u.slab.entry.slab = &slab->base; + bo->u.slab.entry.group_index = group_index; + bo->u.slab.real = slab->buffer; + + LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free); + } + + return &slab->base; + +fail_buffer: + radeon_bo_reference(&slab->buffer, NULL); +fail: + FREE(slab); + return NULL; +} + +void radeon_bo_slab_free(void *priv, struct pb_slab *pslab) +{ + struct radeon_slab *slab = (struct radeon_slab *)pslab; + + for (unsigned i = 0; i < slab->base.num_entries; ++i) { + struct radeon_bo *bo = &slab->entries[i]; + for (unsigned j = 0; j < bo->u.slab.num_fences; ++j) + radeon_bo_reference(&bo->u.slab.fences[j], NULL); + FREE(bo->u.slab.fences); + } + + FREE(slab->entries); + radeon_bo_reference(&slab->buffer, NULL); + FREE(slab); +} + static unsigned eg_tile_split(unsigned tile_split) { switch 
(tile_split) { case 0: tile_split = 64; break; case 1: tile_split = 128; break; case 2: tile_split = 256; break; case 3: tile_split = 512; break; default: case 4: tile_split = 1024; break; case 5: tile_split = 2048; break; @@ -816,20 +937,68 @@ radeon_winsys_bo_create(struct radeon_winsys *rws, enum radeon_bo_flag flags) { struct radeon_drm_winsys *ws = radeon_drm_winsys(rws); struct radeon_bo *bo; unsigned usage = 0, pb_cache_bucket; /* Only 32-bit sizes are supported. */ if (size > UINT_MAX) return NULL; + /* Sub-allocate small buffers from slabs. */ + if (!(flags & RADEON_FLAG_HANDLE) && + size <= (1 << RADEON_SLAB_MAX_SIZE_LOG2) && + ws->info.has_virtual_memory && + alignment <= MAX2(1 << RADEON_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) { + struct pb_slab_entry *entry; + unsigned heap = 0; + + if (flags & RADEON_FLAG_GTT_WC) + heap |= 1; + if (flags & RADEON_FLAG_CPU_ACCESS) + heap |= 2; + if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS)) + goto no_slab; + + switch (domain) { + case RADEON_DOMAIN_VRAM: + heap |= 0 * 4; + break; + case RADEON_DOMAIN_VRAM_GTT: + heap |= 1 * 4; + break; + case RADEON_DOMAIN_GTT: + heap |= 2 * 4; + break; + default: + goto no_slab; + } + + entry = pb_slab_alloc(&ws->bo_slabs, size, heap); + if (!entry) { + /* Clear the cache and try again. */ + pb_cache_release_all_buffers(&ws->bo_cache); + + entry = pb_slab_alloc(&ws->bo_slabs, size, heap); + } + if (!entry) + return NULL; + + bo = NULL; + bo = container_of(entry, bo, u.slab.entry); + + pipe_reference_init(&bo->base.reference, 1); + + return &bo->base; + } +no_slab: + /* This flag is irrelevant for the cache. */ flags &= ~RADEON_FLAG_HANDLE; /* Align size to page size. This is the minimum alignment for normal * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. 
*/ size = align(size, ws->info.gart_page_size); alignment = align(alignment, ws->info.gart_page_size); @@ -855,20 +1024,21 @@ radeon_winsys_bo_create(struct radeon_winsys *rws, bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage, pb_cache_bucket)); if (bo) return &bo->base; bo = radeon_create_bo(ws, size, alignment, usage, domain, flags, pb_cache_bucket); if (!bo) { /* Clear the cache and try again. */ + pb_slabs_reclaim(&ws->bo_slabs); pb_cache_release_all_buffers(&ws->bo_cache); bo = radeon_create_bo(ws, size, alignment, usage, domain, flags, pb_cache_bucket); if (!bo) return NULL; } bo->u.real.use_reusable_pool = true; pipe_mutex_lock(ws->bo_handles_mutex); diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h index 8f767fd..236e94c 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h @@ -67,23 +67,35 @@ struct radeon_bo { enum radeon_bo_domain initial_domain; /* how many command streams is this bo referenced in? */ int num_cs_references; /* how many command streams, which are being emitted in a separate * thread, is this bo referenced in? 
*/ int num_active_ioctls; }; +struct radeon_slab { + struct pb_slab base; + struct radeon_bo *buffer; + struct radeon_bo *entries; +}; + void radeon_bo_destroy(struct pb_buffer *_buf); bool radeon_bo_can_reclaim(struct pb_buffer *_buf); void radeon_drm_bo_init_functions(struct radeon_drm_winsys *ws); +bool radeon_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry); +struct pb_slab *radeon_bo_slab_alloc(void *priv, unsigned heap, + unsigned entry_size, + unsigned group_index); +void radeon_bo_slab_free(void *priv, struct pb_slab *slab); + static inline void radeon_bo_reference(struct radeon_bo **dst, struct radeon_bo *src) { pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src); } void *radeon_bo_do_map(struct radeon_bo *bo); #endif diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index e02f286..ae55746 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -538,20 +538,22 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws) static void radeon_winsys_destroy(struct radeon_winsys *rws) { struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)rws; if (util_queue_is_initialized(&ws->cs_queue)) util_queue_destroy(&ws->cs_queue); pipe_mutex_destroy(ws->hyperz_owner_mutex); pipe_mutex_destroy(ws->cmask_owner_mutex); + if (ws->info.has_virtual_memory) + pb_slabs_deinit(&ws->bo_slabs); pb_cache_deinit(&ws->bo_cache); if (ws->gen >= DRV_R600) { radeon_surface_manager_free(ws->surf_man); } util_hash_table_destroy(ws->bo_names); util_hash_table_destroy(ws->bo_handles); util_hash_table_destroy(ws->bo_vas); pipe_mutex_destroy(ws->bo_handles_mutex); @@ -752,24 +754,39 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create) ws->fd = dup(fd); if (!do_winsys_init(ws)) goto fail1; pb_cache_init(&ws->bo_cache, 500000, ws->check_vm ? 
1.0f : 2.0f, 0, MIN2(ws->info.vram_size, ws->info.gart_size), radeon_bo_destroy, radeon_bo_can_reclaim); + if (ws->info.has_virtual_memory) { + /* There is no fundamental obstacle to using slab buffer allocation + * without GPUVM, but enabling it requires making sure that the drivers + * honor the address offset. + */ + if (!pb_slabs_init(&ws->bo_slabs, + RADEON_SLAB_MIN_SIZE_LOG2, RADEON_SLAB_MAX_SIZE_LOG2, + 12, + ws, + radeon_bo_can_reclaim_slab, + radeon_bo_slab_alloc, + radeon_bo_slab_free)) + goto fail_cache; + } + if (ws->gen >= DRV_R600) { ws->surf_man = radeon_surface_manager_new(ws->fd); if (!ws->surf_man) - goto fail; + goto fail_slab; } /* init reference */ pipe_reference_init(&ws->reference, 1); /* Set functions. */ ws->base.unref = radeon_winsys_unref; ws->base.destroy = radeon_winsys_destroy; ws->base.query_info = radeon_query_info; ws->base.cs_request_feature = radeon_cs_request_feature; @@ -812,21 +829,24 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create) util_hash_table_set(fd_tab, intptr_to_pointer(ws->fd), ws); /* We must unlock the mutex once the winsys is fully initialized, so that * other threads attempting to create the winsys from the same fd will * get a fully initialized winsys and not just half-way initialized. 
*/ pipe_mutex_unlock(fd_tab_mutex); return &ws->base; -fail: +fail_slab: + if (ws->info.has_virtual_memory) + pb_slabs_deinit(&ws->bo_slabs); +fail_cache: pb_cache_deinit(&ws->bo_cache); fail1: pipe_mutex_unlock(fd_tab_mutex); if (ws->surf_man) radeon_surface_manager_free(ws->surf_man); if (ws->fd >= 0) close(ws->fd); FREE(ws); return NULL; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h index b30055c..934cd58 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h @@ -25,20 +25,21 @@ */ /* * Authors: * Corbin Simpson <mostawesomed...@gmail.com> */ #ifndef RADEON_DRM_WINSYS_H #define RADEON_DRM_WINSYS_H #include "gallium/drivers/radeon/radeon_winsys.h" #include "pipebuffer/pb_cache.h" +#include "pipebuffer/pb_slab.h" #include "util/u_queue.h" #include "util/list.h" #include <radeon_drm.h> #ifndef DRM_RADEON_GEM_USERPTR #define DRM_RADEON_GEM_USERPTR 0x2d #define RADEON_GEM_USERPTR_READONLY (1 << 0) #define RADEON_GEM_USERPTR_ANONONLY (1 << 1) @@ -55,24 +56,28 @@ struct drm_radeon_gem_userptr { #endif struct radeon_drm_cs; enum radeon_generation { DRV_R300, DRV_R600, DRV_SI }; +#define RADEON_SLAB_MIN_SIZE_LOG2 9 +#define RADEON_SLAB_MAX_SIZE_LOG2 14 + struct radeon_drm_winsys { struct radeon_winsys base; struct pipe_reference reference; struct pb_cache bo_cache; + struct pb_slabs bo_slabs; int fd; /* DRM file descriptor */ int num_cs; /* The number of command streams created. */ uint64_t allocated_vram; uint64_t allocated_gtt; uint64_t mapped_vram; uint64_t mapped_gtt; uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */ uint64_t num_cs_flushes; uint32_t next_bo_hash; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev