Apart from some minor comments, patches 1-8 are:

Reviewed-by: Marek Olšák <marek.ol...@amd.com>
BTW, I think queries can start using smaller buffers too. If I
understand it correctly, the minimum allocation size is 512 bytes,
right? radeon_info::min_alloc_size can be added for the queries.

Marek
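Roughly what I have in mind, as an untested sketch (min_alloc_size and
the query_buf_size helper are made-up names here; only radeon_info and
MAX2 exist today):

   /* Sketch: let the winsys report the kernel's minimum BO allocation
    * size (512 bytes, if I'm right) instead of hardcoding it. */
   struct radeon_info {
      /* ... existing fields ... */
      uint32_t min_alloc_size; /* minimum BO size in bytes */
   };

   /* Hypothetical use when sizing a query result buffer: */
   static unsigned query_buf_size(const struct radeon_info *info,
                                  unsigned needed)
   {
      return MAX2(needed, info->min_alloc_size);
   }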
On Sun, Sep 18, 2016 at 3:03 PM, Marek Olšák <mar...@gmail.com> wrote:
> On Tue, Sep 13, 2016 at 11:56 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote:
>> From: Nicolai Hähnle <nicolai.haeh...@amd.com>
>>
>> ---
>>  src/gallium/winsys/amdgpu/drm/amdgpu_bo.c     | 165 ++++++++++++++++++++++++++
>>  src/gallium/winsys/amdgpu/drm/amdgpu_bo.h     |  18 +++
>>  src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c |  13 ++
>>  src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |   5 +
>>  4 files changed, 201 insertions(+)
>>
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> index f581d9b..6a61b30 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> @@ -31,20 +31,27 @@
>>
>>  #include "amdgpu_cs.h"
>>
>>  #include "os/os_time.h"
>>  #include "state_tracker/drm_driver.h"
>>  #include <amdgpu_drm.h>
>>  #include <xf86drm.h>
>>  #include <stdio.h>
>>  #include <inttypes.h>
>>
>> +static struct pb_buffer *
>> +amdgpu_bo_create(struct radeon_winsys *rws,
>> +                 uint64_t size,
>> +                 unsigned alignment,
>> +                 enum radeon_bo_domain domain,
>> +                 enum radeon_bo_flag flags);
>> +
>>  static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
>>                             enum radeon_bo_usage usage)
>>  {
>>     struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
>>     struct amdgpu_winsys *ws = bo->ws;
>>     int64_t abs_timeout;
>>
>>     if (timeout == 0) {
>>        if (p_atomic_read(&bo->num_active_ioctls))
>>           return false;
>> @@ -436,20 +443,130 @@ bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
>>  {
>>     struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
>>
>>     if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
>>        return false;
>>     }
>>
>>     return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
>>  }
>>
>> +bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
>> +{
>> +   struct amdgpu_winsys_bo *bo = NULL; /* fix container_of */
>> +   bo = container_of(entry, bo, u.slab.entry);
>> +
>> +   return amdgpu_bo_can_reclaim(&bo->base);
>> +}
>> +
>> +static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
>> +{
>> +   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
>> +
>> +   assert(!bo->bo);
>> +
>> +   pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry);
>> +}
>> +
>> +static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
>> +   amdgpu_bo_slab_destroy
>> +   /* other functions are never called */
>> +};
>> +
>> +struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
>> +                                     unsigned entry_size,
>> +                                     unsigned group_index)
>> +{
>> +   struct amdgpu_winsys *ws = priv;
>> +   struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
>> +   enum radeon_bo_domain domains;
>> +   enum radeon_bo_flag flags = 0;
>> +   uint32_t base_id;
>> +
>> +   if (!slab)
>> +      return NULL;
>> +
>> +   if (heap & 1)
>> +      flags |= RADEON_FLAG_GTT_WC;
>> +   if (heap & 2)
>> +      flags |= RADEON_FLAG_CPU_ACCESS;
>> +
>> +   switch (heap >> 2) {
>> +   case 0:
>> +      domains = RADEON_DOMAIN_VRAM;
>> +      break;
>> +   default:
>> +   case 1:
>> +      domains = RADEON_DOMAIN_VRAM_GTT;
>> +      break;
>> +   case 2:
>> +      domains = RADEON_DOMAIN_GTT;
>> +      break;
>> +   }
>> +
>> +   slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
>> +                                                    64 * 1024, 64 * 1024,
>> +                                                    domains, flags));
>> +   if (!slab->buffer)
>> +      goto fail;
>> +
>> +   assert(slab->buffer->bo);
>> +
>> +   slab->base.num_entries = slab->buffer->base.size / entry_size;
>> +   slab->base.num_free = slab->base.num_entries;
>> +   slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
>> +   if (!slab->entries)
>> +      goto fail_buffer;
>> +
>> +   LIST_INITHEAD(&slab->base.free);
>> +
>> +   base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
>> +
>> +   for (unsigned i = 0; i < slab->base.num_entries; ++i) {
>> +      struct amdgpu_winsys_bo *bo = &slab->entries[i];
>> +
>> +      bo->base.alignment = entry_size;
>> +      bo->base.usage = slab->buffer->base.usage;
>> +      bo->base.size = entry_size;
>> +      bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
>> +      bo->ws = ws;
>> +      bo->va = slab->buffer->va + i * entry_size;
>> +      bo->initial_domain = domains;
>> +      bo->unique_id = base_id + i;
>> +      bo->u.slab.entry.slab = &slab->base;
>> +      bo->u.slab.entry.group_index = group_index;
>> +      bo->u.slab.real = slab->buffer;
>> +
>> +      LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
>> +   }
>> +
>> +   return &slab->base;
>> +
>> +fail_buffer:
>> +   amdgpu_winsys_bo_reference(&slab->buffer, NULL);
>> +fail:
>> +   FREE(slab);
>> +   return NULL;
>> +}
>> +
>> +void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
>> +{
>> +   struct amdgpu_slab *slab = amdgpu_slab(pslab);
>> +
>> +   for (unsigned i = 0; i < slab->base.num_entries; ++i)
>> +      amdgpu_bo_remove_fences(&slab->entries[i]);
>> +
>> +   FREE(slab->entries);
>> +   amdgpu_winsys_bo_reference(&slab->buffer, NULL);
>> +   FREE(slab);
>> +}
>> +
>>  static unsigned eg_tile_split(unsigned tile_split)
>>  {
>>     switch (tile_split) {
>>     case 0:     tile_split = 64;    break;
>>     case 1:     tile_split = 128;   break;
>>     case 2:     tile_split = 256;   break;
>>     case 3:     tile_split = 512;   break;
>>     default:
>>     case 4:     tile_split = 1024;  break;
>>     case 5:     tile_split = 2048;  break;
>> @@ -548,20 +665,67 @@ static struct pb_buffer *
>>  amdgpu_bo_create(struct radeon_winsys *rws,
>>                   uint64_t size,
>>                   unsigned alignment,
>>                   enum radeon_bo_domain domain,
>>                   enum radeon_bo_flag flags)
>>  {
>>     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>>     struct amdgpu_winsys_bo *bo;
>>     unsigned usage = 0, pb_cache_bucket;
>>
>> +   /* Sub-allocate small buffers from slabs. */
>> +   if (!(flags & RADEON_FLAG_HANDLE) &&
>> +       size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
>> +       alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
>> +      struct pb_slab_entry *entry;
>> +      unsigned heap = 0;
>> +
>
> A comment would be useful here to say that the following code filters
> out textures. (i.e. HANDLE = textures, INVISIBLE VRAM = tiled textures)
>
> Marek
>
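To be concrete, I mean something along these lines (just a sketch of
the wording, rephrase as you like):

   /* The checks below filter out textures: textures are allocated
    * with RADEON_FLAG_HANDLE, and tiled textures use invisible VRAM
    * (no CPU access), so neither can take the slab path. */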
>> +      if (flags & RADEON_FLAG_GTT_WC)
>> +         heap |= 1;
>> +      if (flags & RADEON_FLAG_CPU_ACCESS)
>> +         heap |= 2;
>> +      if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS))
>> +         goto no_slab;
>> +
>> +      switch (domain) {
>> +      case RADEON_DOMAIN_VRAM:
>> +         heap |= 0 * 4;
>> +         break;
>> +      case RADEON_DOMAIN_VRAM_GTT:
>> +         heap |= 1 * 4;
>> +         break;
>> +      case RADEON_DOMAIN_GTT:
>> +         heap |= 2 * 4;
>> +         break;
>> +      default:
>> +         goto no_slab;
>> +      }
>> +
>> +      entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
>> +      if (!entry) {
>> +         /* Clear the cache and try again. */
>> +         pb_cache_release_all_buffers(&ws->bo_cache);
>> +
>> +         entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
>> +      }
>> +      if (!entry)
>> +         return NULL;
>> +
>> +      bo = NULL;
>> +      bo = container_of(entry, bo, u.slab.entry);
>> +
>> +      pipe_reference_init(&bo->base.reference, 1);
>> +
>> +      return &bo->base;
>> +   }
>> +no_slab:
>> +
>>     /* This flag is irrelevant for the cache. */
>>     flags &= ~RADEON_FLAG_HANDLE;
>>
>>     /* Align size to page size. This is the minimum alignment for normal
>>      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>>      * like constant/uniform buffers, can benefit from better and more reuse.
>>      */
>>     size = align64(size, ws->info.gart_page_size);
>>     alignment = align(alignment, ws->info.gart_page_size);
>>
>> @@ -590,20 +754,21 @@ amdgpu_bo_create(struct radeon_winsys *rws,
>>     pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
>>                             pb_cache_bucket);
>>     if (bo)
>>        return &bo->base;
>>
>>     /* Create a new one. */
>>     bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>>                           pb_cache_bucket);
>>     if (!bo) {
>>        /* Clear the cache and try again. */
>> +      pb_slabs_reclaim(&ws->bo_slabs);
>>        pb_cache_release_all_buffers(&ws->bo_cache);
>>        bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>>                              pb_cache_bucket);
>>        if (!bo)
>>           return NULL;
>>     }
>>
>>     bo->u.real.use_reusable_pool = true;
>>     return &bo->base;
>>  }
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>> index e5b5cf5..1e25897 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>> @@ -74,28 +74,46 @@ struct amdgpu_winsys_bo {
>>      * it can only transition from false to true
>>      */
>>     volatile int is_shared; /* bool (int for atomicity) */
>>
>>     /* Fences for buffer synchronization. */
>>     unsigned num_fences;
>>     unsigned max_fences;
>>     struct pipe_fence_handle **fences;
>>  };
>>
>> +struct amdgpu_slab {
>> +   struct pb_slab base;
>> +   struct amdgpu_winsys_bo *buffer;
>> +   struct amdgpu_winsys_bo *entries;
>> +};
>> +
>>  bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf);
>>  void amdgpu_bo_destroy(struct pb_buffer *_buf);
>>  void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
>>
>> +bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry);
>> +struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
>> +                                     unsigned entry_size,
>> +                                     unsigned group_index);
>> +void amdgpu_bo_slab_free(void *priv, struct pb_slab *slab);
>> +
>>  static inline
>>  struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
>>  {
>>     return (struct amdgpu_winsys_bo *)bo;
>>  }
>>
>>  static inline
>> +struct amdgpu_slab *amdgpu_slab(struct pb_slab *slab)
>> +{
>> +   return (struct amdgpu_slab *)slab;
>> +}
>> +
>> +static inline
>>  void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
>>                                  struct amdgpu_winsys_bo *src)
>>  {
>>     pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
>>  }
>>
>>  #endif
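Minor nit: the "bo = NULL; /* fix container_of */" dance now appears
twice (amdgpu_bo_can_reclaim_slab and amdgpu_bo_create). A small
inline helper next to amdgpu_slab() could hide it; untested sketch,
the function name is made up:

   static inline struct amdgpu_winsys_bo *
   amdgpu_winsys_bo_from_slab_entry(struct pb_slab_entry *entry)
   {
      struct amdgpu_winsys_bo *bo = NULL; /* fix container_of */
      return container_of(entry, bo, u.slab.entry);
   }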
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>> index 3961ee3..c83489d 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>> @@ -375,20 +375,21 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
>>  }
>>
>>  static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
>>  {
>>     struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
>>
>>     if (util_queue_is_initialized(&ws->cs_queue))
>>        util_queue_destroy(&ws->cs_queue);
>>
>>     pipe_mutex_destroy(ws->bo_fence_lock);
>> +   pb_slabs_deinit(&ws->bo_slabs);
>>     pb_cache_deinit(&ws->bo_cache);
>>     pipe_mutex_destroy(ws->global_bo_list_lock);
>>     do_winsys_deinit(ws);
>>     FREE(rws);
>>  }
>>
>>  static void amdgpu_winsys_query_info(struct radeon_winsys *rws,
>>                                       struct radeon_info *info)
>>  {
>>     *info = ((struct amdgpu_winsys *)rws)->info;
>> @@ -540,20 +541,29 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
>>     ws->info.drm_minor = drm_minor;
>>
>>     if (!do_winsys_init(ws, fd))
>>        goto fail_alloc;
>>
>>     /* Create managers. */
>>     pb_cache_init(&ws->bo_cache, 500000, ws->check_vm ? 1.0f : 2.0f, 0,
>>                   (ws->info.vram_size + ws->info.gart_size) / 8,
>>                   amdgpu_bo_destroy, amdgpu_bo_can_reclaim);
>>
>> +   if (!pb_slabs_init(&ws->bo_slabs,
>> +                      AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2,
>> +                      12, /* number of heaps (domain/flags combinations) */
>> +                      ws,
>> +                      amdgpu_bo_can_reclaim_slab,
>> +                      amdgpu_bo_slab_alloc,
>> +                      amdgpu_bo_slab_free))
>> +      goto fail_cache;
>> +
>>     /* init reference */
>>     pipe_reference_init(&ws->reference, 1);
>>
>>     /* Set functions. */
>>     ws->base.unref = amdgpu_winsys_unref;
>>     ws->base.destroy = amdgpu_winsys_destroy;
>>     ws->base.query_info = amdgpu_winsys_query_info;
>>     ws->base.cs_request_feature = amdgpu_cs_request_feature;
>>     ws->base.query_value = amdgpu_query_value;
>>     ws->base.read_registers = amdgpu_read_registers;
>> @@ -583,16 +593,19 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
>>
>>     util_hash_table_set(dev_tab, dev, ws);
>>
>>     /* We must unlock the mutex once the winsys is fully initialized, so that
>>      * other threads attempting to create the winsys from the same fd will
>>      * get a fully initialized winsys and not just half-way initialized. */
>>     pipe_mutex_unlock(dev_tab_mutex);
>>
>>     return &ws->base;
>>
>> +fail_cache:
>> +   pb_cache_deinit(&ws->bo_cache);
>> +   do_winsys_deinit(ws);
>>  fail_alloc:
>>     FREE(ws);
>>  fail:
>>     pipe_mutex_unlock(dev_tab_mutex);
>>     return NULL;
>>  }
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
>> index 96d4e6d..69c6638 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
>> @@ -26,31 +26,36 @@
>>   */
>>  /*
>>   * Authors:
>>   *      Marek Olšák <mar...@gmail.com>
>>   */
>>
>>  #ifndef AMDGPU_WINSYS_H
>>  #define AMDGPU_WINSYS_H
>>
>>  #include "pipebuffer/pb_cache.h"
>> +#include "pipebuffer/pb_slab.h"
>>  #include "gallium/drivers/radeon/radeon_winsys.h"
>>  #include "addrlib/addrinterface.h"
>>  #include "util/u_queue.h"
>>  #include <amdgpu.h>
>>
>>  struct amdgpu_cs;
>>
>> +#define AMDGPU_SLAB_MIN_SIZE_LOG2 9
>> +#define AMDGPU_SLAB_MAX_SIZE_LOG2 14
>> +
>>  struct amdgpu_winsys {
>>     struct radeon_winsys base;
>>     struct pipe_reference reference;
>>     struct pb_cache bo_cache;
>> +   struct pb_slabs bo_slabs;
>>
>>     amdgpu_device_handle dev;
>>
>>     pipe_mutex bo_fence_lock;
>>
>>     int num_cs; /* The number of command streams created. */
>>     uint32_t next_bo_unique_id;
>>     uint64_t allocated_vram;
>>     uint64_t allocated_gtt;
>>     uint64_t mapped_vram;
>> --
>> 2.7.4
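One more nit on the pb_slabs_init call above: the magic 12 is
3 domains * 4 flag combinations, which silently depends on the heap
encoding in amdgpu_bo.c. Deriving it would be safer; sketch only, the
macro name is made up:

   /* 3 domains (VRAM, VRAM_GTT, GTT) times 4 combinations of
    * RADEON_FLAG_GTT_WC and RADEON_FLAG_CPU_ACCESS. */
   #define AMDGPU_SLAB_NUM_HEAPS (3 * 4)

Marek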