> Why does that makes a difference if it is seen for the first time? > > [ml] if it is presented for the first time for belonging ctx, means even > current CS do not involve context switch, we still need keep the actions in > preamble IB. > Usually if current CS is from the same cntx of previous CS, that means no ctx > switch occurs, so we can skip the actions in preamble IB. but above case is > the exception.
Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue. - Bas [ML] I'm confused, what's your point? With this patch, preamble_flag is not needed at all. Without this patch, many original assumption and logic is not correct. Besides, CONTEXT_CONTROL not only deals CE but also deal DE. BR Monk -----Original Message----- From: Bas Nieuwenhuizen [mailto:b...@basnieuwenhuizen.nl] Sent: Thursday, September 01, 2016 4:19 PM To: Liu, Monk <monk....@amd.com> Cc: Christian König <deathsim...@vodafone.de>; amd-gfx@lists.freedesktop.org Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3) On Thu, Sep 1, 2016 at 9:37 AM, Liu, Monk <monk....@amd.com> wrote: > > > -----Original Message----- > From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf > Of Christian K?nig > Sent: Wednesday, August 31, 2016 7:53 PM > To: Liu, Monk <monk....@amd.com>; amd-gfx@lists.freedesktop.org > Subject: Re: [PATCH] drm/amdgpu:implement CONTEXT_CONTROL (v3) > > Looks good to me in general, a few nit picks and sugegstions below. > > Am 31.08.2016 um 05:49 schrieb Monk Liu: >> v1: >> for gfx8, use CONTEXT_CONTROL package to dynamically skip preamble >> CEIB and other load_xxx command in sequence. >> >> v2: >> support GFX7 as well, and bump up version. >> remove cntxcntl in compute ring funcs because CPC doesn't support >> this packet. >> >> v3: fix reduntant judgement in cntxcntl. >> >> Change-Id: I4b87ca84ea8c11ba4f7fb4c0e8a5be537ccde851 >> Signed-off-by: Monk Liu <monk....@amd.com> >> >> Change-Id: I5d24c1bb5c14190ce4adeb6a331ee3d92b3d5c83 >> Signed-off-by: Monk Liu <monk....@amd.com> > > Only one signed of by line is enough and remove the change-ids. > >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 12 ++++++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 16 +++++++++------- >> drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 20 ++++++++++++++++++++ >> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 30 ++++++++++++++++++++++++++++++ >> 6 files changed, 82 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 1254410..0de5f08 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs { >> void (*begin_use)(struct amdgpu_ring *ring); >> void (*end_use)(struct amdgpu_ring *ring); >> void (*emit_switch_buffer) (struct amdgpu_ring *ring); >> + void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t >> + flags); >> }; >> >> /* >> @@ -965,6 +966,7 @@ struct amdgpu_ctx { >> spinlock_t ring_lock; >> struct fence **fences; >> struct amdgpu_ctx_ring rings[AMDGPU_MAX_RINGS]; >> + bool preamble_presented; >> }; >> >> struct amdgpu_ctx_mgr { >> @@ -1227,8 +1229,13 @@ struct amdgpu_cs_parser { >> >> /* user fence */ >> struct amdgpu_bo_list_entry uf_entry; >> + bool preamble_present; /* True means this command submit >> +involves a preamble IB */ > > We only need this in amdgpu_cs_ib_fill() don't we? See below as well. > > [ML] seems good advice > >> }; >> >> +#define PREAMBLE_IB_PRESENT (1 << 0) /* bit set means command >> submit involves a preamble IB */ >> +#define PREAMBLE_IB_PRESENT_FIRST (1 << 1) /* bit set means preamble IB >> is first presented in belonging context */ > > Why does that makes a difference if it is seen for the first time? > > [ml] if it is presented for the first time for belonging ctx, means even > current CS do not involve context switch, we still need keep the actions in > preamble IB. > Usually if current CS is from the same cntx of previous CS, that means no ctx > switch occurs, so we can skip the actions in preamble IB. but above case is > the exception. Can't userspace just not set the preamble flag for the first submit with a preamble? I think that would result in the same behavior, unless having two non-preamble CE IB's in a single submit is an issue. - Bas > >> +#define HAVE_CTX_SWITCH (1 << 2) /* bit set means context >> switch occured */ >> + >> struct amdgpu_job { >> struct amd_sched_job base; >> struct amdgpu_device *adev; >> @@ -1237,6 +1244,7 @@ struct amdgpu_job { >> struct amdgpu_sync sync; >> struct amdgpu_ib *ibs; >> struct fence *fence; /* the hw fence */ >> + uint32_t preamble_status; >> uint32_t num_ibs; >> void *owner; >> uint64_t fence_ctx; /* the fence_context this job uses >> */ >> @@ -2264,6 +2272,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring) >> #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r)) >> #define amdgpu_ring_emit_hdp_invalidate(r) >> (r)->funcs->emit_hdp_invalidate((r)) >> #define amdgpu_ring_emit_switch_buffer(r) >> (r)->funcs->emit_switch_buffer((r)) >> +#define amdgpu_ring_emit_cntxcntl(r, d) >> +(r)->funcs->emit_cntxcntl((r), (d)) >> #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib))) >> #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r)) >> #define amdgpu_ring_patch_cond_exec(r,o) >> (r)->funcs->patch_cond_exec((r),(o)) >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> index 2d4e005..6d8c050 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> @@ -792,6 +792,9 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, >> if (r) >> return r; >> >> + if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE) >> + parser->preamble_present = true; >> + >> if (parser->job->ring && parser->job->ring != ring) >> return -EINVAL; >> >> @@ -930,6 +933,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, >> return r; >> } >> >> + if (p->preamble_present) { >> + job->preamble_status |= PREAMBLE_IB_PRESENT; >> + if (!p->ctx->preamble_presented) >> + job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST; >> + } >> + > > Better move this to the end of amdgpu_cs_ib_fill() where we allocate the IBs > as well. > [ML] okay, good change. > > > >> job->owner = p->filp; >> job->fence_ctx = entity->fence_context; >> p->fence = fence_get(&job->base.s_fence->finished); >> @@ -940,6 +949,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, >> trace_amdgpu_cs_ioctl(job); >> amd_sched_entity_push_job(&job->base); >> >> + if (p->preamble_present) >> + p->ctx->preamble_presented = true; >> + >> return 0; >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> index 56c85e6..44db0ab 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> @@ -55,9 +55,10 @@ >> * - 3.3.0 - Add VM support for UVD on supported hardware. >> * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS. >> * - 3.5.0 - Add support for new UVD_NO_OP register. >> + * - 3.6.0 - UMD doesn't/shouldn't need to use CONTEXT_CONTROL in >> + IB, KMD should do it >> */ >> #define KMS_DRIVER_MAJOR 3 >> -#define KMS_DRIVER_MINOR 5 >> +#define KMS_DRIVER_MINOR 6 >> #define KMS_DRIVER_PATCHLEVEL 0 >> >> int amdgpu_vram_limit = 0; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c >> index 04263f0..b12b5ba 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c >> @@ -121,10 +121,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, >> unsigned num_ibs, >> { >> struct amdgpu_device *adev = ring->adev; >> struct amdgpu_ib *ib = &ibs[0]; >> - bool skip_preamble, need_ctx_switch; >> + bool need_ctx_switch; >> unsigned patch_offset = ~0; >> struct amdgpu_vm *vm; >> uint64_t fence_ctx; >> + uint32_t status = 0; >> >> unsigned i; >> int r = 0; >> @@ -174,15 +175,16 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, >> unsigned num_ibs, >> /* always set cond_exec_polling to CONTINUE */ >> *ring->cond_exe_cpu_addr = 1; >> >> - skip_preamble = ring->current_ctx == fence_ctx; >> need_ctx_switch = ring->current_ctx != fence_ctx; >> + if (job && ring->funcs->emit_cntxcntl) { >> + if (need_ctx_switch) >> + status |= HAVE_CTX_SWITCH; >> + status |= job->preamble_status; >> + amdgpu_ring_emit_cntxcntl(ring, status); >> + } >> + >> for (i = 0; i < num_ibs; ++i) { >> ib = &ibs[i]; >> - >> - /* drop preamble IBs if we don't have a context switch */ >> - if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble) >> - continue; >> - > > Would be nice to keep this functionality for cases where we don't support > emit_cntxcntl (e.g. SI?). > [ML] SI support CONTEXT_CONTROL as well, and the package structure is exactly > the same as CI. > >> amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0, >> need_ctx_switch); >> need_ctx_switch = false; diff --git >> a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c >> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c >> index f055d49..0d5addb 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c >> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct >> amdgpu_ring *ring, >> amdgpu_ring_write(ring, control); >> } >> >> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, >> +uint32_t flags) { >> + uint32_t dw2 = 0; >> + >> + dw2 |= 0x80000000; /* set load_enable otherwise this package is just >> NOPs */ >> + if (flags & HAVE_CTX_SWITCH) { >> + /* set load_global_config & load_global_uconfig */ >> + dw2 |= 0x8001; >> + /* set load_cs_sh_regs */ >> + dw2 |= 0x01000000; >> + /* set load_per_context_state & load_gfx_sh_regs */ >> + dw2 |= 0x10002; > > Better define some constants for those. > > [ML] I'll leave it to other guys when doing cleanups, a little hurry for > other jobs now ... > > Regards, > Christian. > >> + } >> + >> + amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1)); >> + amdgpu_ring_write(ring, dw2); >> + amdgpu_ring_write(ring, 0); >> +} >> + >> /** >> * gfx_v7_0_ring_test_ib - basic ring IB test >> * >> @@ -4929,6 +4948,7 @@ static const struct amdgpu_ring_funcs >> gfx_v7_0_ring_funcs_gfx = { >> .test_ib = gfx_v7_0_ring_test_ib, >> .insert_nop = amdgpu_ring_insert_nop, >> .pad_ib = amdgpu_ring_generic_pad_ib, >> + .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl, >> }; >> >> static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = >> { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c >> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c >> index 8ba8e42..73f6ffa 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c >> @@ -6085,6 +6085,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring >> *ring) >> amdgpu_ring_write(ring, 0); >> } >> >> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, >> +uint32_t flags) { >> + uint32_t dw2 = 0; >> + >> + dw2 |= 0x80000000; /* set load_enable otherwise this package is just >> NOPs */ >> + if (flags & HAVE_CTX_SWITCH) { >> + /* set load_global_config & load_global_uconfig */ >> + dw2 |= 0x8001; >> + /* set load_cs_sh_regs */ >> + dw2 |= 0x01000000; >> + /* set load_per_context_state & load_gfx_sh_regs for GFX */ >> + dw2 |= 0x10002; >> + >> + /* set load_ce_ram if preamble presented */ >> + if (PREAMBLE_IB_PRESENT & flags) >> + dw2 |= 0x10000000; >> + } else { >> + /* still load_ce_ram if this is the first time preamble >> presented >> + * although there is no context switch happens. >> + */ >> + if (PREAMBLE_IB_PRESENT_FIRST & flags) >> + dw2 |= 0x10000000; >> + } >> + >> + amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1)); >> + amdgpu_ring_write(ring, dw2); >> + amdgpu_ring_write(ring, 0); >> +} >> + >> static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device >> *adev, >> enum amdgpu_interrupt_state >> state) >> { >> @@ -6267,6 +6296,7 @@ static const struct amdgpu_ring_funcs >> gfx_v8_0_ring_funcs_gfx = { >> .insert_nop = amdgpu_ring_insert_nop, >> .pad_ib = amdgpu_ring_generic_pad_ib, >> .emit_switch_buffer = gfx_v8_ring_emit_sb, >> + .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl, >> }; >> >> static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = >> { > > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx