On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukha...@amd.com> wrote: > > On 8/1/2024 8:49 AM, Marek Olšák wrote: > > On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.kha...@amd.com> > wrote: > >> Adding NOP packets one by one in the ring > >> does not use the CP efficiently. > >> > >> Solution: > >> Use CP optimization while adding NOP packet's so PFP > >> can discard NOP packets based on information of count > >> from the Header instead of fetching all NOP packets > >> one by one. > >> > >> Cc: Christian König <christian.koe...@amd.com> > >> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-pra...@amd.com> > >> Cc: Tvrtko Ursulin <tursu...@igalia.com> > >> Cc: Marek Olšák <marek.ol...@amd.com> > >> Signed-off-by: Sunil Khatri <sunil.kha...@amd.com> > >> --- > >> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++--- > >> 1 file changed, 21 insertions(+), 3 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >> index 853084a2ce7f..edf5b5c4d185 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > >> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct > amdgpu_ring *ring) > >> amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */ > >> } > >> > >> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, > uint32_t num_nop) > >> +{ > >> + int i; > >> + > >> + /* Header itself is a NOP packet */ > >> + if (num_nop == 1) { > >> + amdgpu_ring_write(ring, ring->funcs->nop); > >> + return; > >> + } > >> + > >> + /* Max HW optimization till 0x3ffe, followed by remaining one > NOP at a time*/ > >> + amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, > 0x3ffe))); > >> + > >> + /* Header is at index 0, followed by num_nops - 1 NOP packet's > */ > >> + for (i = 1; i < num_nop; i++) > >> + amdgpu_ring_write(ring, ring->funcs->nop); > > This loop should be removed. It's unnecessary CPU overhead and we > > should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the > > whole packet body uninitialized is the fastest option. > That was the original intent to just move the WPTR for the no of nops > and tried too. Based on Christian inputs we should not let the nops packet > > as garbage or whatever was there originally as a threat/safety measure.
It doesn't help safety. It can only be read by the GPU with kernel-level permissions. Initializing the packet body is useless and adds CPU overhead, especially with the 256 NOPs or so that we use for no reason. Marek > So CPU side there isnt any optimization but just CP will skip all these > so GPU side should see the benefit. > > Regards > Sunil Khatri > > > > > Marek > > > >> +} > >> + > >> static void gfx_v10_ip_print(void *handle, struct drm_printer *p) > >> { > >> struct amdgpu_device *adev = (struct amdgpu_device *)handle; > >> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs > gfx_v10_0_ring_funcs_gfx = { > >> .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush, > >> .test_ring = gfx_v10_0_ring_test_ring, > >> .test_ib = gfx_v10_0_ring_test_ib, > >> - .insert_nop = amdgpu_ring_insert_nop, > >> + .insert_nop = amdgpu_gfx10_ring_insert_nop, > >> .pad_ib = amdgpu_ring_generic_pad_ib, > >> .emit_switch_buffer = gfx_v10_0_ring_emit_sb, > >> .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl, > >> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs > gfx_v10_0_ring_funcs_compute = { > >> .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush, > >> .test_ring = gfx_v10_0_ring_test_ring, > >> .test_ib = gfx_v10_0_ring_test_ib, > >> - .insert_nop = amdgpu_ring_insert_nop, > >> + .insert_nop = amdgpu_gfx10_ring_insert_nop, > >> .pad_ib = amdgpu_ring_generic_pad_ib, > >> .emit_wreg = gfx_v10_0_ring_emit_wreg, > >> .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, > >> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs > gfx_v10_0_ring_funcs_kiq = { > >> .emit_fence = gfx_v10_0_ring_emit_fence_kiq, > >> .test_ring = gfx_v10_0_ring_test_ring, > >> .test_ib = gfx_v10_0_ring_test_ib, > >> - .insert_nop = amdgpu_ring_insert_nop, > >> + .insert_nop = amdgpu_gfx10_ring_insert_nop, > >> .pad_ib = amdgpu_ring_generic_pad_ib, > >> .emit_rreg = gfx_v10_0_ring_emit_rreg, > >> .emit_wreg = gfx_v10_0_ring_emit_wreg, > >> -- > >> 2.34.1 > >> >