From: Nicolai Hähnle <nicolai.haeh...@amd.com>

Introducing radeon_bo::hash will reduce collisions between "real" buffers
and buffers from slabs.
---
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c     |  3 +
 src/gallium/winsys/radeon/drm/radeon_drm_bo.h     |  1 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c     | 98 ++++++++++++++++++++---
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h     | 16 +++-
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.h |  1 +
 5 files changed, 107 insertions(+), 12 deletions(-)
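
Side note for reviewers, not part of the patch to apply: slab entries carry no
GEM handle (bo->handle == 0), so hashing on the handle would send every slab
entry to the same bucket of the 4096-entry reloc_indices_hashlist, while the
new per-winsys counter gives every buffer a distinct hash. Below is a minimal
standalone sketch of that difference; struct fake_bo, init_bo() and the plain
increment are illustrative stand-ins for struct radeon_bo, radeon_create_bo()
and __sync_fetch_and_add(), not code from the patch.

/* Illustration only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

#define HASHLIST_SIZE 4096              /* matches reloc_indices_hashlist */

struct fake_bo {
    uint32_t handle;                    /* 0 for slab entries */
    uint32_t hash;                      /* new field added by this patch */
};

static uint32_t next_bo_hash;           /* per-winsys counter in the patch */

static void init_bo(struct fake_bo *bo, uint32_t handle)
{
    bo->handle = handle;
    bo->hash = next_bo_hash++;          /* single-threaded stand-in for the
                                         * atomic fetch-and-add */
}

int main(void)
{
    struct fake_bo slab_entries[4];
    int i;

    for (i = 0; i < 4; i++)
        init_bo(&slab_entries[i], 0);   /* slab entries: handle == 0 */

    for (i = 0; i < 4; i++)
        printf("slab entry %d: handle bucket %u, hash bucket %u\n", i,
               slab_entries[i].handle & (HASHLIST_SIZE - 1),
               slab_entries[i].hash & (HASHLIST_SIZE - 1));
    return 0;
}

With the handle as key, all four slab entries land in bucket 0; with the
counter they spread over buckets 0-3.
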
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index df6e53c..1725080 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -580,20 +580,21 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
     pipe_reference_init(&bo->base.reference, 1);
     bo->base.alignment = alignment;
     bo->base.usage = usage;
     bo->base.size = size;
     bo->base.vtbl = &radeon_bo_vtbl;
     bo->rws = rws;
     bo->handle = args.handle;
     bo->va = 0;
     bo->initial_domain = initial_domains;
+    bo->hash = __sync_fetch_and_add(&rws->next_bo_hash, 1);
     pipe_mutex_init(bo->u.real.map_mutex);
     pb_cache_init_entry(&rws->bo_cache, &bo->u.real.cache_entry, &bo->base,
                         pb_cache_bucket);
 
     if (rws->info.has_virtual_memory) {
         struct drm_radeon_gem_va va;
         unsigned va_gap_size;
 
         va_gap_size = rws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
         bo->va = radeon_bomgr_find_va(rws, size + va_gap_size, alignment);
@@ -857,20 +858,21 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
     /* Initialize it. */
     pipe_reference_init(&bo->base.reference, 1);
     bo->handle = args.handle;
     bo->base.alignment = 0;
     bo->base.size = size;
     bo->base.vtbl = &radeon_bo_vtbl;
     bo->rws = ws;
     bo->user_ptr = pointer;
     bo->va = 0;
     bo->initial_domain = RADEON_DOMAIN_GTT;
+    bo->hash = __sync_fetch_and_add(&ws->next_bo_hash, 1);
     pipe_mutex_init(bo->u.real.map_mutex);
 
     util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
 
     pipe_mutex_unlock(ws->bo_handles_mutex);
 
     if (ws->info.has_virtual_memory) {
         struct drm_radeon_gem_va va;
 
         bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
@@ -990,20 +992,21 @@ static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws,
     bo->handle = handle;
 
     /* Initialize it. */
     pipe_reference_init(&bo->base.reference, 1);
     bo->base.alignment = 0;
     bo->base.size = (unsigned) size;
     bo->base.vtbl = &radeon_bo_vtbl;
     bo->rws = ws;
     bo->va = 0;
+    bo->hash = __sync_fetch_and_add(&ws->next_bo_hash, 1);
     pipe_mutex_init(bo->u.real.map_mutex);
 
     if (bo->flink_name)
         util_hash_table_set(ws->bo_names, (void*)(uintptr_t)bo->flink_name, bo);
 
     util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
 
 done:
     pipe_mutex_unlock(ws->bo_handles_mutex);
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index b9a4a05..8e35a38 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -52,20 +52,21 @@ struct radeon_bo {
             struct radeon_bo *real;
         } slab;
     } u;
 
     struct radeon_drm_winsys *rws;
     void *user_ptr; /* from buffer_from_ptr */
 
     uint32_t handle; /* 0 for slab entries */
     uint32_t flink_name;
     uint64_t va;
+    uint32_t hash;
     enum radeon_bo_domain initial_domain;
 
     /* how many command streams is this bo referenced in? */
     int num_cs_references;
 
     /* how many command streams, which are being emitted in a separate
      * thread, is this bo referenced in? */
     int num_active_ioctls;
 };
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 20f90cf..9fbd378 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -122,34 +122,40 @@ static bool radeon_init_cs_context(struct radeon_cs_context *csc,
 }
 
 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
 {
     unsigned i;
 
     for (i = 0; i < csc->num_relocs; i++) {
         p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
         radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
     }
+    for (i = 0; i < csc->num_slab_buffers; ++i) {
+        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
+        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
+    }
 
     csc->num_relocs = 0;
     csc->num_validated_relocs = 0;
+    csc->num_slab_buffers = 0;
     csc->chunks[0].length_dw = 0;
     csc->chunks[1].length_dw = 0;
 
     for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
         csc->reloc_indices_hashlist[i] = -1;
     }
 }
 
 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
 {
     radeon_cs_context_cleanup(csc);
+    FREE(csc->slab_buffers);
     FREE(csc->relocs_bo);
     FREE(csc->relocs);
 }
 
 static struct radeon_winsys_cs *
 radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                      enum ring_type ring_type,
                      void (*flush)(void *ctx, unsigned flags,
                                    struct pipe_fence_handle **fence),
@@ -184,52 +190,62 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
     cs->base.current.buf = cs->csc->buf;
     cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
     cs->ring_type = ring_type;
 
     p_atomic_inc(&ws->num_cs);
     return &cs->base;
 }
 
 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
 {
-    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+    struct radeon_bo_item *buffers;
+    unsigned num_buffers;
     int i = csc->reloc_indices_hashlist[hash];
 
+    if (bo->handle) {
+        buffers = csc->relocs_bo;
+        num_buffers = csc->num_relocs;
+    } else {
+        buffers = csc->slab_buffers;
+        num_buffers = csc->num_slab_buffers;
+    }
+
     /* not found or found */
-    if (i == -1 || csc->relocs_bo[i].bo == bo)
+    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
         return i;
 
     /* Hash collision, look for the BO in the list of relocs linearly. */
-    for (i = csc->num_relocs - 1; i >= 0; i--) {
-        if (csc->relocs_bo[i].bo == bo) {
+    for (i = num_buffers - 1; i >= 0; i--) {
+        if (buffers[i].bo == bo) {
             /* Put this reloc in the hash list.
              * This will prevent additional hash collisions if there are
              * several consecutive lookup_buffer calls for the same buffer.
              *
              * Example: Assuming buffers A,B,C collide in the hash list,
              * the following sequence of relocs:
              *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
              * will collide here: ^ and here:   ^,
              * meaning that we should get very few collisions in the end. */
             csc->reloc_indices_hashlist[hash] = i;
             return i;
         }
     }
     return -1;
 }
 
-static unsigned radeon_lookup_or_add_buffer(struct radeon_drm_cs *cs,
-                                            struct radeon_bo *bo)
+static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
+                                                 struct radeon_bo *bo)
 {
     struct radeon_cs_context *csc = cs->csc;
     struct drm_radeon_cs_reloc *reloc;
-    unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
     int i = -1;
 
     i = radeon_lookup_buffer(csc, bo);
 
     if (i >= 0) {
         /* For async DMA, every add_buffer call must add a buffer to the list
          * no matter how many duplicates there are. This is due to the fact
          * the DMA CS checker doesn't use NOP packets for offset patching,
          * but always uses the i-th buffer from the list to patch the i-th
          * offset. If there are N offsets in a DMA CS, there must also be N
@@ -252,56 +268,113 @@ static unsigned radeon_lookup_or_add_buffer(struct radeon_drm_cs *cs,
         csc->relocs_bo = realloc(csc->relocs_bo, size);
 
         size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
         csc->relocs = realloc(csc->relocs, size);
 
         csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
     }
 
     /* Initialize the new relocation. */
     csc->relocs_bo[csc->num_relocs].bo = NULL;
-    csc->relocs_bo[csc->num_relocs].priority_usage = 0;
+    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
     radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
     p_atomic_inc(&bo->num_cs_references);
     reloc = &csc->relocs[csc->num_relocs];
     reloc->handle = bo->handle;
     reloc->read_domains = 0;
     reloc->write_domain = 0;
     reloc->flags = 0;
 
     csc->reloc_indices_hashlist[hash] = csc->num_relocs;
 
     csc->chunks[1].length_dw += RELOC_DWORDS;
 
     return csc->num_relocs++;
 }
 
+static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
+                                            struct radeon_bo *bo)
+{
+    struct radeon_cs_context *csc = cs->csc;
+    unsigned hash;
+    struct radeon_bo_item *item;
+    int idx;
+    int real_idx;
+
+    idx = radeon_lookup_buffer(csc, bo);
+    if (idx >= 0)
+        return idx;
+
+    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
+
+    /* Check if the backing array is large enough. */
+    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
+        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
+                                (unsigned)(csc->max_slab_buffers * 1.3));
+        struct radeon_bo_item *new_buffers =
+            REALLOC(csc->slab_buffers,
+                    csc->max_slab_buffers * sizeof(*new_buffers),
+                    new_max * sizeof(*new_buffers));
+        if (!new_buffers) {
+            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
+            return -1;
+        }
+
+        csc->max_slab_buffers = new_max;
+        csc->slab_buffers = new_buffers;
+    }
+
+    /* Initialize the new relocation. */
+    idx = csc->num_slab_buffers++;
+    item = &csc->slab_buffers[idx];
+
+    item->bo = NULL;
+    item->u.slab.real_idx = real_idx;
+    radeon_bo_reference(&item->bo, bo);
+    p_atomic_inc(&bo->num_cs_references);
+
+    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+    csc->reloc_indices_hashlist[hash] = idx;
+
+    return idx;
+}
+
 static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                          struct pb_buffer *buf,
                                          enum radeon_bo_usage usage,
                                          enum radeon_bo_domain domains,
                                          enum radeon_bo_priority priority)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_bo *bo = (struct radeon_bo*)buf;
     enum radeon_bo_domain added_domains;
     enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
     enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
     struct drm_radeon_cs_reloc *reloc;
-    unsigned index = radeon_lookup_or_add_buffer(cs, bo);
+    int index;
+
+    if (!bo->handle) {
+        index = radeon_lookup_or_add_slab_buffer(cs, bo);
+        if (index < 0)
+            return 0;
+
+        index = cs->csc->slab_buffers[index].u.slab.real_idx;
+    } else {
+        index = radeon_lookup_or_add_real_buffer(cs, bo);
+    }
 
     reloc = &cs->csc->relocs[index];
     added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
     reloc->read_domains |= rd;
     reloc->write_domain |= wd;
     reloc->flags = MAX2(reloc->flags, priority);
-    cs->csc->relocs_bo[index].priority_usage |= 1llu << priority;
+    cs->csc->relocs_bo[index].u.real.priority_usage |= 1llu << priority;
 
     if (added_domains & RADEON_DOMAIN_VRAM)
         cs->base.used_vram += bo->base.size;
     else if (added_domains & RADEON_DOMAIN_GTT)
         cs->base.used_gart += bo->base.size;
 
     return index;
 }
 
 static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
@@ -359,21 +432,21 @@ static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                               struct radeon_bo_list_item *list)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     int i;
 
     if (list) {
         for (i = 0; i < cs->csc->num_relocs; i++) {
             list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
             list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
-            list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
+            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
         }
     }
     return cs->csc->num_relocs;
 }
 
 void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
 {
     struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
     unsigned i;
     int r;
@@ -577,20 +650,23 @@ static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
     struct radeon_bo *bo = (struct radeon_bo*)_buf;
     int index;
 
     if (!bo->num_cs_references)
         return false;
 
     index = radeon_lookup_buffer(cs->csc, bo);
     if (index == -1)
         return false;
 
+    if (!bo->handle)
+        index = cs->csc->slab_buffers[index].u.slab.real_idx;
+
     if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
         return true;
     if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
         return true;
 
     return false;
 }
 
 /* FENCES */
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index bd55548..f9b26af 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -24,39 +24,50 @@
  * of the Software.
  */
 
 #ifndef RADEON_DRM_CS_H
 #define RADEON_DRM_CS_H
 
 #include "radeon_drm_bo.h"
 
 struct radeon_bo_item {
     struct radeon_bo *bo;
-    uint64_t priority_usage;
+    union {
+        struct {
+            uint64_t priority_usage;
+        } real;
+        struct {
+            unsigned real_idx;
+        } slab;
+    } u;
 };
 
 struct radeon_cs_context {
     uint32_t buf[16 * 1024];
 
     int fd;
     struct drm_radeon_cs cs;
     struct drm_radeon_cs_chunk chunks[3];
     uint64_t chunk_array[3];
     uint32_t flags[2];
 
     /* Buffers. */
     unsigned max_relocs;
     unsigned num_relocs;
     unsigned num_validated_relocs;
     struct radeon_bo_item *relocs_bo;
     struct drm_radeon_cs_reloc *relocs;
 
+    unsigned num_slab_buffers;
+    unsigned max_slab_buffers;
+    struct radeon_bo_item *slab_buffers;
+
     int reloc_indices_hashlist[4096];
 };
 
 struct radeon_drm_cs {
     struct radeon_winsys_cs base;
     enum ring_type ring_type;
 
     /* We flip between these two CS. While one is being consumed
      * by the kernel in another thread, the other one is being filled
      * by the pipe driver. */
@@ -101,20 +112,23 @@ radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
 {
     int index;
 
     if (!bo->num_cs_references)
         return false;
 
     index = radeon_lookup_buffer(cs->csc, bo);
     if (index == -1)
         return false;
 
+    if (!bo->handle)
+        index = cs->csc->slab_buffers[index].u.slab.real_idx;
+
     return cs->csc->relocs[index].write_domain != 0;
 }
 
 static inline bool radeon_bo_is_referenced_by_any_cs(struct radeon_bo *bo)
 {
     return bo->num_cs_references != 0;
 }
 
 void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs);
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 27fbe90..5514980 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -68,20 +68,21 @@ struct radeon_drm_winsys {
     struct pb_cache bo_cache;
 
     int fd; /* DRM file descriptor */
     int num_cs; /* The number of command streams created. */
     uint64_t allocated_vram;
     uint64_t allocated_gtt;
     uint64_t mapped_vram;
     uint64_t mapped_gtt;
     uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
     uint64_t num_cs_flushes;
+    uint32_t next_bo_hash;
 
     enum radeon_generation gen;
     struct radeon_info info;
     uint32_t va_start;
     uint32_t va_unmap_working;
     uint32_t accel_working2;
 
     /* List of buffer GEM names. Protected by bo_handles_mutex. */
     struct util_hash_table *bo_names;
 
     /* List of buffer handles. Protectded by bo_handles_mutex. */
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev