Patches 5-7 look good, but I'd prefer that more experienced eyes take a look too.
Acked-by: Eduardo Lima Mitev <el...@igalia.com>

On 01/17/2017 08:14 AM, Kenneth Graunke wrote:
> The non-LLC story was a horror show. We uploaded data via pwrite
> (drm_intel_bo_subdata), which would stall if the cache BO was in
> use (being read) by the GPU. Obviously, we wanted to avoid that.
> So, we tried to detect whether the buffer was busy, and if so, we'd
> allocate a new BO, map the old one read-only (hopefully not stalling),
> copy all shaders compiled since the dawn of time to the new buffer,
> upload our new one, toss the old BO, and let the state upload code
> know that our program cache BO changed. This was a lot of extra data
> copying, and flagging BRW_NEW_PROGRAM_CACHE would also cause a new
> STATE_BASE_ADDRESS to be emitted, stalling the entire pipeline.
>
> Not only that, but our rudimentary busy tracking consisted of a flag
> set at execbuf time, and not cleared until we threw out the program
> cache BO. So, the first shader upload after any drawing would hit this
> "abandon the cache and start over" copying path.
>
> None of this is necessary - it's just ancient crufty code. We can
> use the same persistent mapping paths on all platforms. On non-LLC
> platforms, we simply need to clflush after memcpy'ing in new shaders,
> so they're visible to the GPU. This is not only better, but the code
> is also significantly simpler.
>
> Signed-off-by: Kenneth Graunke <kenn...@whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_context.h       |  1 -
>  src/mesa/drivers/dri/i965/brw_program_cache.c | 58 ++++++++-------------------
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c |  6 ---
>  3 files changed, 16 insertions(+), 49 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index b032d511a1e..70f240383b1 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -461,7 +461,6 @@ struct brw_cache {
>     GLuint size, n_items;
>  
>     uint32_t next_offset;
> -   bool bo_used_by_gpu;
>  };
>  
>  
> diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
> index 0c1da172588..b76ff12e9fe 100644
> --- a/src/mesa/drivers/dri/i965/brw_program_cache.c
> +++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
> @@ -207,32 +207,31 @@ brw_search_cache(struct brw_cache *cache,
>  }
>  
>  static void
> +memcpy_coherent(bool has_llc, void *dest, const void *src, size_t n)
> +{
> +   memcpy(dest, src, n);
> +   if (!has_llc)
> +      brw_clflush_range(dest, n);
> +}
> +
> +static void
>  brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
>  {
>     struct brw_context *brw = cache->brw;
>     drm_intel_bo *new_bo;
>  
>     new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
> -   if (brw->has_llc)
> -      drm_intel_bo_map(new_bo, true);
> +   drm_intel_bo_map(new_bo, true);
>  
>     /* Copy any existing data that needs to be saved.
>      */
>     if (cache->next_offset != 0) {
> -      if (brw->has_llc) {
> -         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
> -      } else {
> -         drm_intel_bo_map(cache->bo, false);
> -         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
> -                              cache->bo->virtual);
> -         drm_intel_bo_unmap(cache->bo);
> -      }
> +      memcpy_coherent(brw->has_llc, new_bo->virtual, cache->bo->virtual,
> +                      cache->next_offset);
>     }
>  
> -   if (brw->has_llc)
> -      drm_intel_bo_unmap(cache->bo);
> +   drm_intel_bo_unmap(cache->bo);
>     drm_intel_bo_unreference(cache->bo);
>     cache->bo = new_bo;
> -   cache->bo_used_by_gpu = false;
>  
>     /* Since we have a new BO in place, we need to signal the units
>      * that depend on it (state base address on gen5+, or unit state before).
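
The memcpy_coherent() helper above is the heart of the change.
brw_clflush_range() itself isn't part of this diff, but for anyone
reading along, a helper like that generally has the following shape.
This is only my illustration of the usual pattern, not the actual Mesa
code, and the name clflush_range_sketch is made up: flush every 64-byte
cache line covering the written range, then fence so the flushes are
globally visible before the GPU reads the shaders.

    #include <stdint.h>
    #include <stddef.h>

    #define CACHELINE_SIZE 64

    static void
    clflush_range_sketch(void *start, size_t size)
    {
       /* Round down to the start of the containing cache line. */
       char *p = (char *) ((uintptr_t) start & ~((uintptr_t) CACHELINE_SIZE - 1));
       char *end = (char *) start + size;

       /* Flush each 64-byte line covering [start, start + size). */
       while (p < end) {
          __builtin_ia32_clflush(p);
          p += CACHELINE_SIZE;
       }

       /* clflush is weakly ordered, so fence before the GPU is asked
        * to read the freshly written range.
        */
       __builtin_ia32_mfence();
    }

On LLC platforms the CPU and GPU share the last-level cache, so the
plain memcpy() is already coherent and the flush can be skipped, which
is exactly what memcpy_coherent() does.
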
> @@ -249,7 +248,6 @@ brw_lookup_prog(const struct brw_cache *cache,
>                  enum brw_cache_id cache_id,
>                  const void *data, unsigned data_size)
>  {
> -   const struct brw_context *brw = cache->brw;
>     unsigned i;
>     const struct brw_cache_item *item;
>  
> @@ -260,11 +258,7 @@ brw_lookup_prog(const struct brw_cache *cache,
>        if (item->cache_id != cache_id || item->size != data_size)
>           continue;
>  
> -      if (!brw->has_llc)
> -         drm_intel_bo_map(cache->bo, false);
>        ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
> -      if (!brw->has_llc)
> -         drm_intel_bo_unmap(cache->bo);
>        if (ret)
>           continue;
>  
> @@ -279,7 +273,6 @@ static uint32_t
>  brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
>  {
>     uint32_t offset;
> -   struct brw_context *brw = cache->brw;
>  
>     /* Allocate space in the cache BO for our new program. */
>     if (cache->next_offset + size > cache->bo->size) {
> @@ -291,14 +284,6 @@ brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
>        brw_cache_new_bo(cache, new_size);
>     }
>  
> -   /* If we would block on writing to an in-use program BO, just
> -    * recreate it.
> -    */
> -   if (!brw->has_llc && cache->bo_used_by_gpu) {
> -      perf_debug("Copying busy program cache buffer.\n");
> -      brw_cache_new_bo(cache, cache->bo->size);
> -   }
> -
>     offset = cache->next_offset;
>  
>     /* Programs are always 64-byte aligned, so set up the next one now */
> @@ -363,11 +348,8 @@ brw_upload_cache(struct brw_cache *cache,
>        item->offset = brw_alloc_item_data(cache, data_size);
>  
>        /* Copy data to the buffer */
> -      if (brw->has_llc) {
> -         memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
> -      } else {
> -         drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
> -      }
> +      memcpy_coherent(brw->has_llc, cache->bo->virtual + item->offset,
> +                      data, data_size);
>     }
>  
>     /* Set up the memory containing the key and aux_data */
> @@ -404,8 +386,7 @@ brw_init_caches(struct brw_context *brw)
>        calloc(cache->size, sizeof(struct brw_cache_item *));
>  
>     cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", 4096, 64);
> -   if (brw->has_llc)
> -      drm_intel_bo_map(cache->bo, true);
> +   drm_intel_bo_map(cache->bo, true);
>  }
>  
>  static void
> @@ -482,8 +463,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
>  
>     DBG("%s\n", __func__);
>  
> -   if (brw->has_llc)
> -      drm_intel_bo_unmap(cache->bo);
> +   drm_intel_bo_unmap(cache->bo);
>     drm_intel_bo_unreference(cache->bo);
>     cache->bo = NULL;
>     brw_clear_cache(brw, cache);
> @@ -532,9 +512,6 @@ brw_print_program_cache(struct brw_context *brw)
>     const struct brw_cache *cache = &brw->cache;
>     struct brw_cache_item *item;
>  
> -   if (!brw->has_llc)
> -      drm_intel_bo_map(cache->bo, false);
> -
>     for (unsigned i = 0; i < cache->size; i++) {
>        for (item = cache->items[i]; item; item = item->next) {
>           fprintf(stderr, "%s:\n", cache_name(i));
> @@ -542,7 +519,4 @@ brw_print_program_cache(struct brw_context *brw)
>                      item->offset, item->size, stderr);
>        }
>     }
> -
> -   if (!brw->has_llc)
> -      drm_intel_bo_unmap(cache->bo);
>  }
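
To restate the new lifecycle in one place, here is a simplified sketch
based on the hunks above. The bufmgr, has_llc, offset, and kernel
parameters are stand-ins, and error handling and the brw_cache
bookkeeping are omitted:

    #include "intel_bufmgr.h"

    static void
    upload_example(drm_intel_bufmgr *bufmgr, bool has_llc,
                   const void *kernel, size_t kernel_size, uint32_t offset)
    {
       /* Create the cache BO once and keep it mapped for its entire
        * lifetime -- on LLC and non-LLC platforms alike.
        */
       drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "program cache", 4096, 64);
       drm_intel_bo_map(bo, true);

       /* Uploading a shader is now a plain memcpy into the mapping, plus
        * a clflush on non-LLC platforms -- no pwrite, no busy tracking,
        * no wholesale reallocation of the cache.
        */
       memcpy_coherent(has_llc, (char *) bo->virtual + offset,
                       kernel, kernel_size);
    }

Much easier to reason about than the old dual path.
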
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index d1b9317a8c1..29ff49d06f1 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -265,12 +265,6 @@ brw_finish_batch(struct brw_context *brw)
>                               PIPE_CONTROL_CS_STALL);
>        }
>     }
> -
> -   /* Mark that the current program cache BO has been used by the GPU.
> -    * It will be reallocated if we need to put new programs in for the
> -    * next batch.
> -    */
> -   brw->cache.bo_used_by_gpu = true;
>  }
>  
>  static void
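
One side note on the hunk above, for readers wondering why the old flag
was so coarse: it was set unconditionally at the end of every batch and
only cleared when the cache BO was thrown away, so it answered "was
this BO ever submitted?" rather than "is it busy right now?". libdrm
does offer a precise per-BO query; a hypothetical check using it would
look like the fragment below (the old code never used it, and with the
persistent-map scheme in this patch there is no need to):

    /* drm_intel_bo_busy() (intel_bufmgr.h) returns nonzero while the
     * GPU still references the BO.
     */
    if (drm_intel_bo_busy(brw->cache.bo)) {
       perf_debug("Program cache BO still in use by the GPU.\n");
    }
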