gem_exec_latency: Measure the latency of context switching

Mika Kuoppala Wed, 05 Jun 2019 08:04:35 -0700

Chris Wilson <ch...@chris-wilson.co.uk> writes:

> Measure the baseline latency between contexts in order to directly
> compare that with the additional cost of preemption.
>
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> ---
>  tests/i915/gem_exec_latency.c | 230 ++++++++++++++++++++++++++++++++++
>  1 file changed, 230 insertions(+)
>
> diff --git a/tests/i915/gem_exec_latency.c b/tests/i915/gem_exec_latency.c
> index e56d62780..e88fbbc6a 100644
> --- a/tests/i915/gem_exec_latency.c
> +++ b/tests/i915/gem_exec_latency.c
> @@ -410,6 +410,86 @@ static void latency_from_ring(int fd,
>       }
>  }
>  
> +static void execution_latency(int i915, unsigned int ring, const char *name)
> +{
> +     struct drm_i915_gem_exec_object2 obj = {
> +             .handle = gem_create(i915, 4095),
> +     };
> +     struct drm_i915_gem_execbuffer2 execbuf = {
> +             .buffers_ptr = to_user_pointer(&obj),
> +             .buffer_count = 1,
> +             .flags = ring | LOCAL_I915_EXEC_NO_RELOC | 
> LOCAL_I915_EXEC_HANDLE_LUT,
> +     };
> +     const unsigned int mmio_base = 0x2000;
> +     const unsigned int cs_timestamp = mmio_base + 0x358;
> +     volatile uint32_t *timestamp;
> +     uint32_t *cs, *result;
> +
> +     timestamp =
> +             (volatile uint32_t *)((volatile char *)igt_global_mmio + 
> cs_timestamp);
> +
> +     obj.handle = gem_create(i915, 4096);
> +     obj.flags = EXEC_OBJECT_PINNED;
> +     result = gem_mmap__wc(i915, obj.handle, 0, 4096, PROT_WRITE);
> +
> +     for (int i = 0; i < 16; i++) {
> +             cs = result + 16 * i;
> +             *cs++ = 0x24 << 23 | 2; /* SRM */
> +             *cs++ = cs_timestamp;
> +             *cs++ = 4096 - 16 * 4 + i * 4;
> +             *cs++ = 0;
> +             *cs++ = 0xa << 23;


Why not MI_BATCH_BUFFER_END? To emphasize that we have
multiple batches inside a bo?

> +     }
> +
> +     cs = result + 1024 - 16;
> +
> +     for (int length = 2; length <= 16; length <<= 1) {
> +             struct igt_mean submit, batch, total;
> +             int last = length - 1;
> +
> +             igt_mean_init(&submit);
> +             igt_mean_init(&batch);
> +             igt_mean_init(&total);
> +
> +             igt_until_timeout(2) {
> +                     uint32_t now, end;
> +
> +                     cs[last] = 0;
> +
> +                     now = *timestamp;
> +                     for (int i = 0; i < length; i++) {
> +                             execbuf.batch_start_offset = 64 * i;
> +                             gem_execbuf(i915, &execbuf);
> +                     }
> +                     while (!((volatile uint32_t *)cs)[last])
> +                             ;
> +                     end = *timestamp;
> +
> +                     igt_mean_add(&submit, (cs[0] - now) * rcs_clock);
> +                     igt_mean_add(&batch, (cs[last] - cs[0]) * rcs_clock / 
> last);

Just curious of what do you use of inter batch latency?

Oh and do we need to to take the rcs_clock resolution into account
on result calculation. Prolly not as it seems to be ticking fast enough
for 0.1us accuracy.

> +                     igt_mean_add(&total, (end - now) * rcs_clock);
> +             }
> +
> +             igt_info("%sx%d Submission latency: %.2f±%.2fus\n",
> +                      name, length,
> +                      1e-3 * igt_mean_get(&submit),
> +                      1e-3 * sqrt(igt_mean_get_variance(&submit)));
> +
> +             igt_info("%sx%d Inter-batch latency: %.2f±%.2fus\n",
> +                      name, length,
> +                      1e-3 * igt_mean_get(&batch),
> +                      1e-3 * sqrt(igt_mean_get_variance(&batch)));
> +
> +             igt_info("%sx%d End-to-end latency: %.2f±%.2fus\n",
> +                      name, length,
> +                      1e-3 * igt_mean_get(&total),
> +                      1e-3 * sqrt(igt_mean_get_variance(&total)));
> +     }
> +
> +     munmap(result, 4096);
> +     gem_close(i915, obj.handle);
> +}
> +
>  static void
>  __submit_spin(int fd, igt_spin_t *spin, unsigned int flags)
>  {
> @@ -616,6 +696,142 @@ rthog_latency_on_ring(int fd, unsigned int engine, 
> const char *name, unsigned in
>       munmap(results, MMAP_SZ);
>  }
>  
> +static void context_switch(int i915,
> +                        unsigned int engine, const char *name,
> +                        unsigned int flags)
> +{
> +     struct drm_i915_gem_exec_object2 obj[2];
> +     struct drm_i915_gem_relocation_entry reloc[5];
> +     struct drm_i915_gem_execbuffer2 eb;
> +     uint32_t *cs, *bbe, *results, v;
> +     unsigned int mmio_base;
> +     struct igt_mean mean;
> +     uint32_t ctx[2];
> +
> +     /* XXX i915_query()! */
> +     switch (engine) {
> +     case I915_EXEC_DEFAULT:
> +     case I915_EXEC_RENDER:
> +             mmio_base = 0x2000;
> +             break;
> +#if 0
> +     case I915_EXEC_BSD:
> +             mmio_base = 0x12000;
> +             break;
> +#endif

It seems to be there according to bspec. Timestamps don't
work with it?

> +     case I915_EXEC_BLT:
> +             mmio_base = 0x22000;
> +             break;
> +
> +     case I915_EXEC_VEBOX:
> +             if (intel_gen(intel_get_drm_devid(i915)) >= 11)
> +                     mmio_base = 0x1d8000;
> +             else
> +                     mmio_base = 0x1a000;
> +             break;
> +
> +     default:
> +             igt_skip("mmio base not known\n");
> +     }
> +
> +     for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> +             ctx[i] = gem_context_create(i915);
> +
> +     if (flags & PREEMPT) {
> +             gem_context_set_priority(i915, ctx[0], -1023);
> +             gem_context_set_priority(i915, ctx[1], +1023);
> +     }
> +
> +     memset(obj, 0, sizeof(obj));
> +     obj[0].handle = gem_create(i915, 4096);
> +     gem_set_caching(i915, obj[0].handle, 1);
> +     results = gem_mmap__cpu(i915, obj[0].handle, 0, 4096, PROT_READ);
> +     gem_set_domain(i915, obj[0].handle, I915_GEM_DOMAIN_CPU, 0);
> +
> +     obj[1].handle = gem_create(i915, 4096);
> +     memset(reloc,0, sizeof(reloc));
> +     obj[1].relocation_count = ARRAY_SIZE(reloc);
> +     obj[1].relocs_ptr = to_user_pointer(reloc);
> +     bbe = gem_mmap__wc(i915, obj[1].handle, 0, 4096, PROT_WRITE);
> +     gem_set_domain(i915, obj[1].handle,
> +                    I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
> +
> +     cs = bbe;
> +     *cs++ = 0x5 << 23;

/* ARB_CHK */
or even better MI_ARB_CHK;

> +     *cs++ = 0x24 << 23 | 2; /* SRM */
> +     *cs++ = mmio_base + 0x358; /* TIMESTAMP */
> +     reloc[0].target_handle = obj[0].handle;
> +     reloc[0].offset = (cs - bbe) * sizeof(*cs);
> +     *cs++ = 0;
> +     *cs++ = 0;
> +     *cs++ = MI_BATCH_BUFFER_START | 1;
> +     reloc[1].target_handle = obj[1].handle;
> +     reloc[1].offset = (cs - bbe) * sizeof(*cs);
> +     *cs++ = 0;
> +     *cs++ = 0;
> +
> +     cs = bbe + 64;
> +     *cs++ = 0x24 << 23 | 2; /* SRM */
> +     *cs++ = mmio_base + 0x358; /* TIMESTAMP */
> +     reloc[2].target_handle = obj[0].handle;
> +     reloc[2].offset = (cs - bbe) * sizeof(*cs);
> +     *cs++ = reloc[2].delta = 4;
> +     *cs++ = 0;
> +     *cs++ = 0x29 << 23 | 2; /* LRM */
> +     *cs++ = mmio_base + 0x600; /* GPR0 */
> +     reloc[3].target_handle = obj[0].handle;
> +     reloc[3].offset = (cs - bbe) * sizeof(*cs);
> +     *cs++ = 0;
> +     *cs++ = 0;
> +     *cs++ = 0x24 << 23 | 2; /* SRM */
> +     *cs++ = mmio_base + 0x600; /* GPR0 */
> +     reloc[4].target_handle = obj[0].handle;
> +     reloc[4].offset = (cs - bbe) * sizeof(*cs);
> +     *cs++ = reloc[4].delta = 8;
> +     *cs++ = 0;
> +     *cs++ = 0xa << 23;
> +
> +     memset(&eb, 0, sizeof(eb));
> +     eb.buffers_ptr = to_user_pointer(obj);
> +     eb.buffer_count = ARRAY_SIZE(obj);
> +     eb.flags = engine;
> +     eb.flags |= LOCAL_I915_EXEC_NO_RELOC;
> +
> +     v = 0;
> +     igt_mean_init(&mean);
> +     igt_until_timeout(5) {
> +             eb.rsvd1 = ctx[0];
> +             eb.batch_start_offset = 0;
> +             gem_execbuf(i915, &eb);
> +
> +             while (results[0] == v)
> +                     igt_assert(gem_bo_busy(i915, obj[1].handle));

Is there a race there? Object will be busy short time
after the result appears.

> +
> +             eb.rsvd1 = ctx[1];
> +             eb.batch_start_offset = 64 * sizeof(*cs);
> +             gem_execbuf(i915, &eb);
> +
> +             *bbe = 0xa << 23;

__sync_syncronize or are you relying on the corresponding
syscalls?

-Mika

> +             gem_sync(i915, obj[1].handle);
> +             *bbe = 0x5 << 23;
> +
> +             v = results[0];
> +             igt_mean_add(&mean, (results[1] - results[2]) * rcs_clock);
> +     }
> +     igt_info("%s context switch latency%s: %.2f±%.2fus\n",
> +              name, flags & PREEMPT ? " (preempt)" : "",
> +              1e-3 * igt_mean_get(&mean),
> +              1e-3 * sqrt(igt_mean_get_variance(&mean)));
> +     munmap(results, 4096);
> +     munmap(bbe, 4096);
> +
> +     for (int i = 0; i < ARRAY_SIZE(obj); i++)
> +             gem_close(i915, obj[i].handle);
> +
> +     for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> +             gem_context_destroy(i915, ctx[i]);
> +}
> +
>  static double clockrate(int i915, int reg)
>  {
>       volatile uint32_t *mmio;
> @@ -722,6 +938,11 @@ igt_main
>                                                             e->name,
>                                                             0);
>  
> +                             igt_subtest_f("%s-execution-latency", e->name)
> +                                     execution_latency(device,
> +                                                       e->exec_id | e->flags,
> +                                                       e->name);
> +
>                               igt_subtest_f("%s-live-dispatch-queued", 
> e->name)
>                                       latency_on_ring(device,
>                                                       e->exec_id | e->flags,
> @@ -741,12 +962,21 @@ igt_main
>                                                         e->exec_id | e->flags,
>                                                         e->name, CORK);
>  
> +                             igt_subtest_f("%s-cs", e->name)
> +                                     context_switch(device,
> +                                                    e->exec_id | e->flags,
> +                                                    e->name, 0);
>                               igt_subtest_group {
>                                       igt_fixture {
>                                               gem_require_contexts(device);
>                                               
> igt_require(gem_scheduler_has_preemption(device));
>                                       }
>  
> +                                     igt_subtest_f("%s-cs-preempt", e->name)
> +                                             context_switch(device,
> +                                                             e->exec_id | 
> e->flags,
> +                                                             e->name, 
> PREEMPT);
> +
>                                       igt_subtest_f("%s-preemption", e->name)
>                                               latency_from_ring(device,
>                                                                 e->exec_id | 
> e->flags,
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH i-g-t 1/2] i915/gem_exec_latency: Measure the latency of context switching

Reply via email to