From: Nicolai Hähnle <nicolai.haeh...@amd.com>

---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c     | 165 ++++++++++++++++++++++++++
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h     |  18 +++
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c |  13 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |   5 +
 4 files changed, 201 insertions(+)
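The slab path below packs the buffer flags and the memory domain into a single
pb_slabs heap index. A minimal sketch of that encoding, for orientation while
reading the hunks (the helper name slab_heap_index is hypothetical and not part
of the applied diff):

   #include <stdbool.h>

   /* Illustrative only: mirrors the heap encoding used by the slab code below.
    * Bit 0 selects RADEON_FLAG_GTT_WC, bit 1 selects RADEON_FLAG_CPU_ACCESS,
    * and bits 2+ select the domain (0 = VRAM, 1 = VRAM_GTT, 2 = GTT). */
   static unsigned slab_heap_index(unsigned domain_index, bool gtt_wc, bool cpu_access)
   {
      unsigned heap = 0;

      if (gtt_wc)
         heap |= 1;
      if (cpu_access)
         heap |= 2;
      heap |= domain_index * 4;
      return heap;
   }

Three domains times four flag combinations give the 12 heaps passed to
pb_slabs_init() in amdgpu_winsys.c.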
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index f581d9b..6a61b30 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -31,20 +31,27 @@
 #include "amdgpu_cs.h"
 
 #include "os/os_time.h"
 #include "state_tracker/drm_driver.h"
 #include <amdgpu_drm.h>
 #include <xf86drm.h>
 #include <stdio.h>
 #include <inttypes.h>
 
+static struct pb_buffer *
+amdgpu_bo_create(struct radeon_winsys *rws,
+                 uint64_t size,
+                 unsigned alignment,
+                 enum radeon_bo_domain domain,
+                 enum radeon_bo_flag flags);
+
 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
                            enum radeon_bo_usage usage)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    struct amdgpu_winsys *ws = bo->ws;
    int64_t abs_timeout;
 
    if (timeout == 0) {
       if (p_atomic_read(&bo->num_active_ioctls))
          return false;
 
@@ -436,20 +443,130 @@ bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
 
    if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
       return false;
    }
 
    return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
 }
 
+bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
+{
+   struct amdgpu_winsys_bo *bo = NULL; /* fix container_of */
+   bo = container_of(entry, bo, u.slab.entry);
+
+   return amdgpu_bo_can_reclaim(&bo->base);
+}
+
+static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+
+   assert(!bo->bo);
+
+   pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry);
+}
+
+static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
+   amdgpu_bo_slab_destroy
+   /* other functions are never called */
+};
+
+struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
+                                     unsigned entry_size,
+                                     unsigned group_index)
+{
+   struct amdgpu_winsys *ws = priv;
+   struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
+   enum radeon_bo_domain domains;
+   enum radeon_bo_flag flags = 0;
+   uint32_t base_id;
+
+   if (!slab)
+      return NULL;
+
+   if (heap & 1)
+      flags |= RADEON_FLAG_GTT_WC;
+   if (heap & 2)
+      flags |= RADEON_FLAG_CPU_ACCESS;
+
+   switch (heap >> 2) {
+   case 0:
+      domains = RADEON_DOMAIN_VRAM;
+      break;
+   default:
+   case 1:
+      domains = RADEON_DOMAIN_VRAM_GTT;
+      break;
+   case 2:
+      domains = RADEON_DOMAIN_GTT;
+      break;
+   }
+
+   slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
+                                                    64 * 1024, 64 * 1024,
+                                                    domains, flags));
+   if (!slab->buffer)
+      goto fail;
+
+   assert(slab->buffer->bo);
+
+   slab->base.num_entries = slab->buffer->base.size / entry_size;
+   slab->base.num_free = slab->base.num_entries;
+   slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
+   if (!slab->entries)
+      goto fail_buffer;
+
+   LIST_INITHEAD(&slab->base.free);
+
+   base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
+
+   for (unsigned i = 0; i < slab->base.num_entries; ++i) {
+      struct amdgpu_winsys_bo *bo = &slab->entries[i];
+
+      bo->base.alignment = entry_size;
+      bo->base.usage = slab->buffer->base.usage;
+      bo->base.size = entry_size;
+      bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
+      bo->ws = ws;
+      bo->va = slab->buffer->va + i * entry_size;
+      bo->initial_domain = domains;
+      bo->unique_id = base_id + i;
+      bo->u.slab.entry.slab = &slab->base;
+      bo->u.slab.entry.group_index = group_index;
+      bo->u.slab.real = slab->buffer;
+
+      LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
+   }
+
+   return &slab->base;
+
+fail_buffer:
+   amdgpu_winsys_bo_reference(&slab->buffer, NULL);
+fail:
+   FREE(slab);
+   return NULL;
+}
+
+void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
+{
+   struct amdgpu_slab *slab = amdgpu_slab(pslab);
+
+   for (unsigned i = 0; i < slab->base.num_entries; ++i)
+      amdgpu_bo_remove_fences(&slab->entries[i]);
+
+   FREE(slab->entries);
+   amdgpu_winsys_bo_reference(&slab->buffer, NULL);
+   FREE(slab);
+}
+
 static unsigned eg_tile_split(unsigned tile_split)
 {
    switch (tile_split) {
    case 0:     tile_split = 64;    break;
    case 1:     tile_split = 128;   break;
    case 2:     tile_split = 256;   break;
    case 3:     tile_split = 512;   break;
    default:
    case 4:     tile_split = 1024;  break;
    case 5:     tile_split = 2048;  break;
@@ -548,20 +665,67 @@
 static struct pb_buffer *
 amdgpu_bo_create(struct radeon_winsys *rws,
                  uint64_t size,
                  unsigned alignment,
                  enum radeon_bo_domain domain,
                  enum radeon_bo_flag flags)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_winsys_bo *bo;
    unsigned usage = 0, pb_cache_bucket;
 
+   /* Sub-allocate small buffers from slabs. */
+   if (!(flags & RADEON_FLAG_HANDLE) &&
+       size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
+       alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
+      struct pb_slab_entry *entry;
+      unsigned heap = 0;
+
+      if (flags & RADEON_FLAG_GTT_WC)
+         heap |= 1;
+      if (flags & RADEON_FLAG_CPU_ACCESS)
+         heap |= 2;
+      if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS))
+         goto no_slab;
+
+      switch (domain) {
+      case RADEON_DOMAIN_VRAM:
+         heap |= 0 * 4;
+         break;
+      case RADEON_DOMAIN_VRAM_GTT:
+         heap |= 1 * 4;
+         break;
+      case RADEON_DOMAIN_GTT:
+         heap |= 2 * 4;
+         break;
+      default:
+         goto no_slab;
+      }
+
+      entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
+      if (!entry) {
+         /* Clear the cache and try again. */
+         pb_cache_release_all_buffers(&ws->bo_cache);
+
+         entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
+      }
+      if (!entry)
+         return NULL;
+
+      bo = NULL;
+      bo = container_of(entry, bo, u.slab.entry);
+
+      pipe_reference_init(&bo->base.reference, 1);
+
+      return &bo->base;
+   }
+no_slab:
+
    /* This flag is irrelevant for the cache. */
    flags &= ~RADEON_FLAG_HANDLE;
 
    /* Align size to page size. This is the minimum alignment for normal
    * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
    * like constant/uniform buffers, can benefit from better and more reuse.
    */
    size = align64(size, ws->info.gart_page_size);
    alignment = align(alignment, ws->info.gart_page_size);
 
@@ -590,20 +754,21 @@ amdgpu_bo_create(struct radeon_winsys *rws,
         pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
                                 pb_cache_bucket);
    if (bo)
       return &bo->base;
 
    /* Create a new one. */
    bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
                          pb_cache_bucket);
    if (!bo) {
       /* Clear the cache and try again. */
+      pb_slabs_reclaim(&ws->bo_slabs);
       pb_cache_release_all_buffers(&ws->bo_cache);
       bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
                             pb_cache_bucket);
       if (!bo)
          return NULL;
    }
 
    bo->u.real.use_reusable_pool = true;
    return &bo->base;
 }
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index e5b5cf5..1e25897 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -74,28 +74,46 @@ struct amdgpu_winsys_bo {
     * it can only transition from false to true
     */
    volatile int is_shared; /* bool (int for atomicity) */
 
    /* Fences for buffer synchronization. */
    unsigned num_fences;
    unsigned max_fences;
    struct pipe_fence_handle **fences;
 };
 
+struct amdgpu_slab {
+   struct pb_slab base;
+   struct amdgpu_winsys_bo *buffer;
+   struct amdgpu_winsys_bo *entries;
+};
+
 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf);
 void amdgpu_bo_destroy(struct pb_buffer *_buf);
 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
 
+bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry);
+struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
+                                     unsigned entry_size,
+                                     unsigned group_index);
+void amdgpu_bo_slab_free(void *priv, struct pb_slab *slab);
+
 static inline
 struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
 {
    return (struct amdgpu_winsys_bo *)bo;
 }
 
 static inline
+struct amdgpu_slab *amdgpu_slab(struct pb_slab *slab)
+{
+   return (struct amdgpu_slab *)slab;
+}
+
+static inline
 void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
                                 struct amdgpu_winsys_bo *src)
 {
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
 }
 
 #endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 3961ee3..c83489d 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -375,20 +375,21 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
 }
 
 static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
 {
    struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
 
    if (util_queue_is_initialized(&ws->cs_queue))
       util_queue_destroy(&ws->cs_queue);
 
    pipe_mutex_destroy(ws->bo_fence_lock);
+   pb_slabs_deinit(&ws->bo_slabs);
    pb_cache_deinit(&ws->bo_cache);
    pipe_mutex_destroy(ws->global_bo_list_lock);
    do_winsys_deinit(ws);
    FREE(rws);
 }
 
 static void amdgpu_winsys_query_info(struct radeon_winsys *rws,
                                      struct radeon_info *info)
 {
    *info = ((struct amdgpu_winsys *)rws)->info;
@@ -540,20 +541,29 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
    ws->info.drm_minor = drm_minor;
 
    if (!do_winsys_init(ws, fd))
       goto fail_alloc;
 
    /* Create managers. */
    pb_cache_init(&ws->bo_cache, 500000, ws->check_vm ? 1.0f : 2.0f, 0,
                  (ws->info.vram_size + ws->info.gart_size) / 8,
                  amdgpu_bo_destroy, amdgpu_bo_can_reclaim);
 
+   if (!pb_slabs_init(&ws->bo_slabs,
+                      AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2,
+                      12, /* number of heaps (domain/flags combinations) */
+                      ws,
+                      amdgpu_bo_can_reclaim_slab,
+                      amdgpu_bo_slab_alloc,
+                      amdgpu_bo_slab_free))
+      goto fail_cache;
+
    /* init reference */
    pipe_reference_init(&ws->reference, 1);
 
    /* Set functions. */
    ws->base.unref = amdgpu_winsys_unref;
    ws->base.destroy = amdgpu_winsys_destroy;
    ws->base.query_info = amdgpu_winsys_query_info;
    ws->base.cs_request_feature = amdgpu_cs_request_feature;
    ws->base.query_value = amdgpu_query_value;
    ws->base.read_registers = amdgpu_read_registers;
@@ -583,16 +593,19 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
    util_hash_table_set(dev_tab, dev, ws);
 
    /* We must unlock the mutex once the winsys is fully initialized, so that
    * other threads attempting to create the winsys from the same fd will
    * get a fully initialized winsys and not just half-way initialized.
    */
    pipe_mutex_unlock(dev_tab_mutex);
 
    return &ws->base;
 
+fail_cache:
+   pb_cache_deinit(&ws->bo_cache);
+   do_winsys_deinit(ws);
 fail_alloc:
    FREE(ws);
 fail:
    pipe_mutex_unlock(dev_tab_mutex);
    return NULL;
 }
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 96d4e6d..69c6638 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -26,31 +26,36 @@
  */
 /*
  * Authors:
  *      Marek Olšák <mar...@gmail.com>
  */
 
 #ifndef AMDGPU_WINSYS_H
 #define AMDGPU_WINSYS_H
 
 #include "pipebuffer/pb_cache.h"
+#include "pipebuffer/pb_slab.h"
 #include "gallium/drivers/radeon/radeon_winsys.h"
 #include "addrlib/addrinterface.h"
 #include "util/u_queue.h"
 #include <amdgpu.h>
 
 struct amdgpu_cs;
 
+#define AMDGPU_SLAB_MIN_SIZE_LOG2 9
+#define AMDGPU_SLAB_MAX_SIZE_LOG2 14
+
 struct amdgpu_winsys {
    struct radeon_winsys base;
    struct pipe_reference reference;
    struct pb_cache bo_cache;
+   struct pb_slabs bo_slabs;
 
    amdgpu_device_handle dev;
 
    pipe_mutex bo_fence_lock;
 
    int num_cs; /* The number of command streams created. */
    uint32_t next_bo_unique_id;
    uint64_t allocated_vram;
    uint64_t allocated_gtt;
    uint64_t mapped_vram;
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev