OK. Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Marek

On Tue, Apr 19, 2016 at 1:19 PM, Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl> wrote:
> Writes via ssbo/image are completely synchronized by glMemoryBarrier, not
> by the driver. Even for the write-after-read hazard, the user needs to
> put a barrier before the write. This includes all writes by compute.
>
> For transform feedback we apply the same rules as for graphics:
> reading from the feedback buffer might go wrong if transform feedback
> is still active; otherwise the driver synchronizes. I can't really
> find anything in the GL spec that disallows the loop, though.
>
> For writes via the framebuffer, writing to a framebuffer and reading it
> from a CS is a render feedback loop, both for write-after-read and
> read-after-write. The texture regions therefore need to be disjoint, or
> a glTextureBarrier() needs to be used by the user. I am not sure whether
> glTextureBarrier should be enough to prevent a write-after-read
> hazard, as the spec only talks about it solving the read-after-write
> hazard. However, in my current patch it solves both, and I think the
> user needs some form of synchronization there, probably rebinding the
> framebuffer. Furthermore, we have this issue even without compute
> shaders.
>
> - Bas
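For readers following the thread, here is a minimal GL-side sketch of the
application-side rules Bas describes above. It assumes a current GL 4.3+
context (glTextureBarrier needs 4.5 or ARB_texture_barrier); compute_prog,
draw_prog, cs_sampling_tex, ssbo, fbo and tex are illustrative placeholders,
not names taken from the patch:

    /* Compute writes an SSBO that a following draw reads as an SSBO:
     * the application, not the driver, inserts the barrier. */
    glUseProgram(compute_prog);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo);
    glDispatchCompute(64, 1, 1);

    /* Make the compute writes visible to shader-storage reads in the draw.
     * Without this call, the read-after-write hazard is the application's
     * responsibility, not the driver's. */
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

    glUseProgram(draw_prog);
    glDrawArrays(GL_TRIANGLES, 0, 3);

    /* The write-after-read direction works the same way: a barrier is
     * required before compute overwrites a buffer the draw just read. */
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
    glUseProgram(compute_prog);
    glDispatchCompute(64, 1, 1);

    /* Rendering to a texture and then sampling it from a compute shader is
     * a render feedback loop; per the discussion above, either keep the
     * regions disjoint or insert glTextureBarrier() in between. */
    glBindFramebuffer(GL_FRAMEBUFFER, fbo);   /* 'tex' is attached to 'fbo' */
    glDrawArrays(GL_TRIANGLES, 0, 3);
    glTextureBarrier();
    glUseProgram(cs_sampling_tex);            /* CS that samples 'tex' */
    glDispatchCompute(64, 1, 1);

With those barriers on the application side, the driver only has to make its
own internal operations (the CP DMA clears, streamout binds, etc. touched by
the patch below) wait for compute as well.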
> On Tue, Apr 19, 2016 at 12:50 PM, Marek Olšák <mar...@gmail.com> wrote:
>> There can be read-after-write hazards when transitioning from compute
>> to graphics and vice versa. Is the user expected to call
>> glMemoryBarrier in this case, or do we need to synchronize explicitly
>> in the driver?
>>
>> Marek
>>
>> On Tue, Apr 19, 2016 at 1:39 AM, Bas Nieuwenhuizen
>> <b...@basnieuwenhuizen.nl> wrote:
>>> v2: Add more CS_PARTIAL_FLUSH events.
>>>
>>> Essentially every place that waits for pixel shaders to finish
>>> also has a write-after-read hazard with compute shaders.
>>>
>>> Invalidating L2 waits implicitly on pixel and compute shaders,
>>> so we don't need a CS_PARTIAL_FLUSH for switching the FBO.
>>>
>>> v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.
>>>
>>> According to Marek, the INV_GLOBAL_L2 events don't wait for compute
>>> shaders to finish, so wait for them explicitly.
>>>
>>> Signed-off-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
>>> ---
>>>  src/gallium/drivers/radeonsi/si_compute.c     | 17 ++---------------
>>>  src/gallium/drivers/radeonsi/si_cp_dma.c      |  6 ++++--
>>>  src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
>>>  src/gallium/drivers/radeonsi/si_hw_context.c  |  1 +
>>>  src/gallium/drivers/radeonsi/si_state.c       | 12 ++++++++----
>>>  5 files changed, 17 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>>> index 10b88b3..6803334 100644
>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>> @@ -439,13 +439,8 @@ static void si_launch_grid(
>>>         if (!sctx->cs_shader_state.initialized)
>>>                 si_initialize_compute(sctx);
>>>
>>> -       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>> -                        SI_CONTEXT_INV_GLOBAL_L2 |
>>> -                        SI_CONTEXT_INV_ICACHE |
>>> -                        SI_CONTEXT_INV_SMEM_L1 |
>>> -                        SI_CONTEXT_FLUSH_WITH_INV_L2 |
>>> -                        SI_CONTEXT_FLAG_COMPUTE;
>>> -       si_emit_cache_flush(sctx, NULL);
>>> +       if (sctx->b.flags)
>>> +               si_emit_cache_flush(sctx, NULL);
>>>
>>>         if (!si_switch_compute_shader(sctx, program, &program->shader,
>>>                                       info->pc))
>>>                 return;
>>> @@ -478,14 +473,6 @@ static void si_launch_grid(
>>>         si_setup_tgsi_grid(sctx, info);
>>>
>>>         si_emit_dispatch_packets(sctx, info);
>>> -
>>> -       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
>>> -                        SI_CONTEXT_INV_VMEM_L1 |
>>> -                        SI_CONTEXT_INV_GLOBAL_L2 |
>>> -                        SI_CONTEXT_INV_ICACHE |
>>> -                        SI_CONTEXT_INV_SMEM_L1 |
>>> -                        SI_CONTEXT_FLAG_COMPUTE;
>>> -       si_emit_cache_flush(sctx, NULL);
>>>  }
>>>
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> index 001ddd4..38e0ee6 100644
>>> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
>>> @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>>>         uint64_t va = r600_resource(dst)->gpu_address + offset;
>>>
>>>         /* Flush the caches. */
>>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>>
>>>         while (size) {
>>>                 unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
>>> @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
>>>         }
>>>
>>>         /* Flush the caches. */
>>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
>>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>>>
>>>         /* This is the main part doing the copying. Src is always aligned. */
>>>         main_dst_offset = dst_offset + skipped_size;
>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> index 5b65fae..98ad3a7 100644
>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
>>>          * start writing to the targets.
>>>          */
>>>         if (num_targets)
>>> -               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>>> +               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> +                                SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>>         /* Streamout buffers must be bound in 2 places:
>>>          * 1) in VGT by setting the VGT_STRMOUT registers
>>> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
>>> index 9862f07..b179092e 100644
>>> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
>>> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
>>> @@ -84,6 +84,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
>>>         ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>>>                         SI_CONTEXT_INV_VMEM_L1 |
>>>                         SI_CONTEXT_INV_GLOBAL_L2 |
>>> +                       SI_CONTEXT_CS_PARTIAL_FLUSH |
>>>                         /* this is probably not needed anymore */
>>>                         SI_CONTEXT_PS_PARTIAL_FLUSH;
>>>         si_emit_cache_flush(ctx, NULL);
>>> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
>>> index af9ffdd..305a70b 100644
>>> --- a/src/gallium/drivers/radeonsi/si_state.c
>>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>>> @@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>>>          */
>>>         sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>>                          SI_CONTEXT_INV_GLOBAL_L2 |
>>> -                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
>>> +                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>>         /* Take the maximum of the old and new count. If the new count is lower,
>>>          * dirtying is needed to disable the unbound colorbuffers.
>>> @@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
>>>
>>>         sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
>>>                          SI_CONTEXT_INV_GLOBAL_L2 |
>>> -                        SI_CONTEXT_FLUSH_AND_INV_CB;
>>> +                        SI_CONTEXT_FLUSH_AND_INV_CB |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>  }
>>>
>>>  static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>> @@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>>
>>>         /* Subsequent commands must wait for all shader invocations to
>>>          * complete. */
>>> -       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>>> +       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> +                        SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>
>>>         if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
>>>                 sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
>>> @@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>>                           PIPE_BARRIER_SHADER_BUFFER |
>>>                           PIPE_BARRIER_TEXTURE |
>>>                           PIPE_BARRIER_IMAGE |
>>> -                         PIPE_BARRIER_STREAMOUT_BUFFER)) {
>>> +                         PIPE_BARRIER_STREAMOUT_BUFFER |
>>> +                         PIPE_BARRIER_GLOBAL_BUFFER)) {
>>>                 /* As far as I can tell, L1 contents are written back to L2
>>>                  * automatically at end of shader, but the contents of other
>>>                  * L1 caches might still be stale. */
>>> --
>>> 2.8.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev