void si_gfx_write_event_eop(struct si_context *ctx,
unsigned event, unsigned event_flags,
- unsigned data_sel,
+ unsigned dst_sel, unsigned int_sel, unsigned
data_sel,
struct r600_resource *buf, uint64_t va,
uint32_t new_fence, unsigned query_type)
{
struct radeon_cmdbuf *cs = ctx->gfx_cs;
unsigned op = EVENT_TYPE(event) |
- EVENT_INDEX(5) |
+ EVENT_INDEX(event == V_028A90_CS_DONE ||
+ event == V_028A90_PS_DONE ? 6 : 5) |
event_flags;
- unsigned sel = EOP_DATA_SEL(data_sel);
-
- /* Wait for write confirmation before writing data, but don't send
- * an interrupt. */
- if (data_sel != EOP_DATA_SEL_DISCARD)
- sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);
+ unsigned sel = EOP_DST_SEL(dst_sel) |
+ EOP_INT_SEL(int_sel) |
+ EOP_DATA_SEL(data_sel);
if (ctx->chip_class >= GFX9) {
/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
* counters) must immediately precede every timestamp event to
* prevent a GPU hang on GFX9.
*
* Occlusion queries don't need to do it here, because they
* always do ZPASS_DONE before the timestamp.
*/
if (ctx->chip_class == GFX9 &&
@@ -268,21 +266,24 @@ static void si_fine_fence_set(struct si_context *ctx,
if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
struct radeon_cmdbuf *cs = ctx->gfx_cs;
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(V_370_PFP));
radeon_emit(cs, fence_va);
radeon_emit(cs, fence_va >> 32);
radeon_emit(cs, 0x80000000);
} else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
- si_gfx_write_event_eop(ctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ si_gfx_write_event_eop(ctx,
+ V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
EOP_DATA_SEL_VALUE_32BIT,
NULL, fence_va, 0x80000000,
PIPE_QUERY_GPU_FINISHED);
} else {
assert(false);
}
}
static boolean si_fence_finish(struct pipe_screen *screen,
struct pipe_context *ctx,
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c
b/src/gallium/drivers/radeonsi/si_perfcounter.c
index de71572c8aa..f3ef3d28c8a 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -574,20 +574,22 @@ static void si_pc_emit_start(struct si_context *sctx,
}
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
* do it again in here. */
static void si_pc_emit_stop(struct si_context *sctx,
struct r600_resource *buffer, uint64_t va)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
si_gfx_write_event_eop(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
EOP_DATA_SEL_VALUE_32BIT,
buffer, va, 0, SI_NOT_QUERY);
si_gfx_wait_fence(sctx, va, 0, 0xffffffff);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) |
EVENT_INDEX(0));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
b/src/gallium/drivers/radeonsi/si_pipe.h
index 29d7e555a0c..73c54df3a03 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1165,21 +1165,21 @@ void si_sdma_clear_buffer(struct si_context *sctx,
struct pipe_resource *dst,
void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
struct r600_resource *dst, struct r600_resource *src);
void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
struct pipe_fence_handle **fence);
void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource
*dst,
uint64_t offset, uint64_t size, unsigned value);
/* si_fence.c */
void si_gfx_write_event_eop(struct si_context *ctx,
unsigned event, unsigned event_flags,
- unsigned data_sel,
+ unsigned dst_sel, unsigned int_sel, unsigned
data_sel,
struct r600_resource *buf, uint64_t va,
uint32_t new_fence, unsigned query_type);
unsigned si_gfx_write_fence_dwords(struct si_screen *screen);
void si_gfx_wait_fence(struct si_context *ctx,
uint64_t va, uint32_t ref, uint32_t mask);
void si_init_fence_functions(struct si_context *ctx);
void si_init_screen_fence_functions(struct si_screen *screen);
struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
struct tc_unflushed_batch_token
*tc_token);
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index bdd7e2c060c..45c8e146ecf 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -884,21 +884,23 @@ static void si_query_hw_do_emit_stop(struct si_context
*sctx,
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
va += 16;
for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
emit_sample_streamout(cs, va + 32 * stream, stream);
break;
case PIPE_QUERY_TIME_ELAPSED:
va += 8;
/* fall through */
case PIPE_QUERY_TIMESTAMP:
si_gfx_write_event_eop(sctx, V_028A90_BOTTOM_OF_PIPE_TS,
- 0, EOP_DATA_SEL_TIMESTAMP, NULL, va,
+ 0, EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+ EOP_DATA_SEL_TIMESTAMP, NULL, va,
0, query->b.type);
fence_va = va + 8;
break;
case PIPE_QUERY_PIPELINE_STATISTICS: {
unsigned sample_size = (query->result_size - 8) / 2;
va += sample_size;
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) |
EVENT_INDEX(2));
radeon_emit(cs, va);
@@ -906,25 +908,28 @@ static void si_query_hw_do_emit_stop(struct si_context
*sctx,
fence_va = va + sample_size;
break;
}
default:
assert(0);
}
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf,
RADEON_USAGE_WRITE,
RADEON_PRIO_QUERY);
- if (fence_va)
+ if (fence_va) {
si_gfx_write_event_eop(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
EOP_DATA_SEL_VALUE_32BIT,
query->buffer.buf, fence_va, 0x80000000,
query->b.type);
+ }
}
static void si_query_hw_emit_stop(struct si_context *sctx,
struct si_query_hw *query)
{
uint64_t va;
if (!query->buffer.buf)
return; // previous buffer allocation failure
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index fceb9debc47..3d56d8e9ab4 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -911,22 +911,24 @@ void si_emit_cache_flush(struct si_context *sctx)
S_0085F0_CB1_DEST_BASE_ENA(1) |
S_0085F0_CB2_DEST_BASE_ENA(1) |
S_0085F0_CB3_DEST_BASE_ENA(1) |
S_0085F0_CB4_DEST_BASE_ENA(1) |
S_0085F0_CB5_DEST_BASE_ENA(1) |
S_0085F0_CB6_DEST_BASE_ENA(1) |
S_0085F0_CB7_DEST_BASE_ENA(1);
/* Necessary for DCC */
if (sctx->chip_class == VI)
- si_gfx_write_event_eop(sctx,
V_028A90_FLUSH_AND_INV_CB_DATA_TS,
- 0, EOP_DATA_SEL_DISCARD,
NULL,
+ si_gfx_write_event_eop(sctx,
+
V_028A90_FLUSH_AND_INV_CB_DATA_TS,
+ 0, EOP_DST_SEL_MEM,
EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_DISCARD,
NULL,
0, 0, SI_NOT_QUERY);
}
if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
S_0085F0_DB_DEST_BASE_ENA(1);
}
if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
/* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -1027,20 +1029,22 @@ void si_emit_cache_flush(struct si_context *sctx)
SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
SI_CONTEXT_INV_VMEM_L1);
sctx->num_L2_invalidates++;
}
/* Do the flush (enqueue the event and wait for it). */
va = sctx->wait_mem_scratch->gpu_address;
sctx->wait_mem_number++;
si_gfx_write_event_eop(sctx, cb_db_event, tc_flags,
+ EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
EOP_DATA_SEL_VALUE_32BIT,
sctx->wait_mem_scratch, va,
sctx->wait_mem_number, SI_NOT_QUERY);
si_gfx_wait_fence(sctx, va, sctx->wait_mem_number, 0xffffffff);
}
/* Make sure ME is idle (it executes most packets) before continuing.
* This prevents read-after-write hazards between PFP and ME.
*/
if (cp_coher_cntl ||