By keeping a pointer to the next available location, we reduce the number of memory accesses needed to write to the batchbuffer.
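For illustration, here is a minimal sketch of the before/after. The
struct and function names are simplified stand-ins, not the driver's
actual declarations; the real fields live in struct intel_batchbuffer
in brw_context.h.

/* Before: each emit loads the map pointer and the used index from
 * the batch struct, then stores the incremented index back. */
struct batch_old {
   uint32_t *map;
   uint16_t used;                 /* offset into map, in dwords */
};

static inline void
emit_dword_old(struct batch_old *b, uint32_t dword)
{
   b->map[b->used++] = dword;
}

/* After: a single pointer is loaded, written through, and bumped.
 * The used count is recomputed only when needed, by pointer
 * subtraction; since the operands are uint32_t *, the result is
 * already a dword count (this is what USED_BATCH() below does). */
struct batch_new {
   uint32_t *map;                 /* start of the batch */
   uint32_t *map_next;            /* next free dword */
};

static inline void
emit_dword_new(struct batch_new *b, uint32_t dword)
{
   *b->map_next++ = dword;
}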
A net ~7k reduction of .text size, 7.5k of which is from the change
to intel_batchbuffer_emit_dword().

   text    data     bss     dec    hex filename
4943740  195152   26192 5165084 4ed01c i965_dri.so before
4936804  195152   26192 5158148 4eb504 i965_dri.so after

Combined with the previous patch, improves performance of Synmark
OglBatch7 by 4.05914% +/- 1.49373% (n=270) on Haswell.
---
Full disclosure: when testing on an IVB desktop, I measured a
regression in the same benchmark of -4.19005% +/- 1.15188% (n=30).
I don't have any explanation. Testing on other benchmarks and
especially on Atom systems would be very welcome.

 src/mesa/drivers/dri/i965/brw_blorp.cpp            |  4 +--
 src/mesa/drivers/dri/i965/brw_context.h            |  5 ++--
 .../drivers/dri/i965/brw_performance_monitor.c     |  6 ++--
 src/mesa/drivers/dri/i965/brw_state_batch.c        |  4 +--
 src/mesa/drivers/dri/i965/brw_urb.c                |  6 ++--
 src/mesa/drivers/dri/i965/intel_batchbuffer.c      | 35 +++++++++++-----------
 src/mesa/drivers/dri/i965/intel_batchbuffer.h      | 11 ++++---
 7 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index 2ccfae1..eac1f00 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -226,7 +226,7 @@ retry:
    intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
    intel_batchbuffer_save_state(brw);
    drm_intel_bo *saved_bo = brw->batch.bo;
-   uint32_t saved_used = brw->batch.used;
+   uint32_t saved_used = USED_BATCH(brw->batch);
    uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;
 
    switch (brw->gen) {
@@ -245,7 +245,7 @@ retry:
     * reserved enough space that a wrap will never happen.
     */
    assert(brw->batch.bo == saved_bo);
-   assert((brw->batch.used - saved_used) * 4 +
+   assert((USED_BATCH(brw->batch) - saved_used) * 4 +
           (saved_state_batch_offset - brw->batch.state_batch_offset) <
           estimated_max_batch_usage);
    /* Shut up compiler warnings on release build */
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index afb714b..c7ad35e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -875,7 +875,8 @@ struct intel_batchbuffer {
 #ifdef DEBUG
    uint16_t emit, total;
 #endif
-   uint16_t used, reserved_space;
+   uint16_t reserved_space;
+   uint32_t *map_next;
    uint32_t *map;
    uint32_t *cpu_map;
 #define BATCH_SZ (8192*sizeof(uint32_t))
@@ -887,7 +888,7 @@ struct intel_batchbuffer {
    uint8_t pipe_controls_since_last_cs_stall;
 
    struct {
-      uint16_t used;
+      uint32_t *map_next;
       int reloc_count;
    } saved;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index 0a12375..7e90e8a 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -710,7 +710,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
    /* Make sure the commands to take a snapshot fits in a single batch. */
    intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
                                    RENDER_RING);
-   int batch_used = brw->batch.used;
+   int batch_used = USED_BATCH(brw->batch);
 
    /* Reports apparently don't always get written unless we flush first. */
    brw_emit_mi_flush(brw);
@@ -754,7 +754,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
    brw_emit_mi_flush(brw);
 
    (void) batch_used;
-   assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
+   assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
 }
 
 /**
@@ -1386,7 +1386,7 @@ void
 brw_perf_monitor_new_batch(struct brw_context *brw)
 {
    assert(brw->batch.ring == RENDER_RING);
-   assert(brw->gen < 6 || brw->batch.used == 0);
+   assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0);
 
    if (brw->perfmon.oa_users == 0)
       return;
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index a405a80..d79e0ea 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -87,7 +87,7 @@ brw_annotate_aub(struct brw_context *brw)
    drm_intel_aub_annotation annotations[annotation_count];
    int a = 0;
    make_annotation(&annotations[a++], AUB_TRACE_TYPE_BATCH, 0,
-                   4*brw->batch.used);
+                   4 * USED_BATCH(brw->batch));
    for (int i = brw->state_batch_count; i-- > 0; ) {
       uint32_t type = brw->state_batch_list[i].type;
       uint32_t start_offset = brw->state_batch_list[i].offset;
@@ -136,7 +136,7 @@ __brw_state_batch(struct brw_context *brw,
     * space, then flush and try again.
     */
    if (batch->state_batch_offset < size ||
-       offset < 4*batch->used + batch->reserved_space) {
+       offset < 4 * USED_BATCH(*batch) + batch->reserved_space) {
       intel_batchbuffer_flush(brw);
       offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 6fcf1b0..6078c38 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -249,10 +249,10 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits1.cs_fence = brw->urb.size;
 
    /* erratum: URB_FENCE must not cross a 64byte cacheline */
-   if ((brw->batch.used & 15) > 12) {
-      int pad = 16 - (brw->batch.used & 15);
+   if ((USED_BATCH(brw->batch) & 15) > 12) {
+      int pad = 16 - (USED_BATCH(brw->batch) & 15);
       do
-         brw->batch.map[brw->batch.used++] = MI_NOOP;
+         *brw->batch.map_next++ = MI_NOOP;
       while (--pad);
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 54081a1..16876ed 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -57,6 +57,7 @@ intel_batchbuffer_init(struct brw_context *brw)
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
+      brw->batch.map_next = brw->batch.cpu_map;
    }
 }
 
@@ -77,10 +78,10 @@ intel_batchbuffer_reset(struct brw_context *brw)
       drm_intel_bo_map(brw->batch.bo, true);
       brw->batch.map = brw->batch.bo->virtual;
    }
+   brw->batch.map_next = brw->batch.map;
 
    brw->batch.reserved_space = BATCH_RESERVED;
    brw->batch.state_batch_offset = brw->batch.bo->size;
-   brw->batch.used = 0;
    brw->batch.needs_sol_reset = false;
    brw->batch.pipe_controls_since_last_cs_stall = 0;
 
@@ -93,7 +94,7 @@ void
 intel_batchbuffer_save_state(struct brw_context *brw)
 {
-   brw->batch.saved.used = brw->batch.used;
+   brw->batch.saved.map_next = brw->batch.map_next;
    brw->batch.saved.reloc_count =
       drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
 }
@@ -103,8 +104,8 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 {
    drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
 
-   brw->batch.used = brw->batch.saved.used;
-   if (brw->batch.used == 0)
+   brw->batch.map_next = brw->batch.saved.map_next;
+   if (USED_BATCH(brw->batch) == 0)
       brw->batch.ring = UNKNOWN_RING;
 }
 
@@ -133,7 +134,7 @@ do_batch_dump(struct brw_context *brw)
       drm_intel_decode_set_batch_pointer(decode,
                                          batch->bo->virtual,
                                          batch->bo->offset64,
-                                         batch->used);
+                                         USED_BATCH(*batch));
    } else {
       fprintf(stderr,
               "WARNING: failed to map batchbuffer (%s), "
@@ -142,7 +143,7 @@ do_batch_dump(struct brw_context *brw)
       drm_intel_decode_set_batch_pointer(decode,
                                          batch->map,
                                          batch->bo->offset64,
-                                         batch->used);
+                                         USED_BATCH(*batch));
    }
 
    drm_intel_decode_set_output_file(decode, stderr);
@@ -278,7 +279,7 @@ do_flush_locked(struct brw_context *brw)
    if (brw->has_llc) {
       drm_intel_bo_unmap(batch->bo);
    } else {
-      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
+      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
       if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
          ret = drm_intel_bo_subdata(batch->bo,
                                     batch->state_batch_offset,
@@ -303,11 +304,11 @@ do_flush_locked(struct brw_context *brw)
          brw_annotate_aub(brw);
 
       if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
-         ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
-                                     flags);
+         ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
+                                     NULL, 0, 0, flags);
       } else {
          ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
-                                             4 * batch->used, flags);
+                                             4 * USED_BATCH(*batch), flags);
       }
    }
@@ -331,7 +332,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 {
    int ret;
 
-   if (brw->batch.used == 0)
+   if (USED_BATCH(brw->batch) == 0)
       return 0;
 
    if (brw->throttle_batch[0] == NULL) {
@@ -340,7 +341,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-      int bytes_for_commands = 4 * brw->batch.used;
+      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
       int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
       int total_bytes = bytes_for_commands + bytes_for_state;
       fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
@@ -356,7 +357,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 
    /* Mark the end of the buffer. */
    intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
-   if (brw->batch.used & 1) {
+   if (USED_BATCH(brw->batch) & 1) {
       /* Round batchbuffer usage to 2 DWORDs. */
       intel_batchbuffer_emit_dword(brw, MI_NOOP);
    }
@@ -390,7 +391,7 @@ intel_batchbuffer_emit_reloc(struct brw_context *brw,
 {
    int ret;
 
-   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4 * USED_BATCH(brw->batch),
                                  buffer, delta,
                                  read_domains, write_domain);
    assert(ret == 0);
@@ -411,7 +412,7 @@ intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                                uint32_t read_domains, uint32_t write_domain,
                                uint32_t delta)
 {
-   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4 * USED_BATCH(brw->batch),
                                      buffer, delta,
                                      read_domains, write_domain);
    assert(ret == 0);
@@ -435,8 +436,8 @@ intel_batchbuffer_data(struct brw_context *brw,
 {
    assert((bytes & 3) == 0);
    intel_batchbuffer_require_space(brw, bytes, ring);
-   memcpy(brw->batch.map + brw->batch.used, data, bytes);
-   brw->batch.used += bytes >> 2;
+   memcpy(brw->batch.map_next, data, bytes);
+   brw->batch.map_next += bytes >> 2;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index f0971e9..a12387b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -63,6 +63,9 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                                     uint32_t read_domains,
                                     uint32_t write_domain,
                                     uint32_t offset);
+
+#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
+
 static inline uint32_t float_as_int(float f)
 {
    union {
@@ -83,7 +86,7 @@ static inline unsigned
 intel_batchbuffer_space(struct brw_context *brw)
 {
    return (brw->batch.state_batch_offset - brw->batch.reserved_space)
-      - brw->batch.used*4;
+      - USED_BATCH(brw->batch) * 4;
 }
 
 
@@ -93,7 +96,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
 {
 #ifdef DEBUG
    assert(intel_batchbuffer_space(brw) >= 4);
 #endif
-   brw->batch.map[brw->batch.used++] = dword;
+   *brw->batch.map_next++ = dword;
    assert(brw->batch.ring != UNKNOWN_RING);
 }
 
@@ -135,7 +138,7 @@ intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
    intel_batchbuffer_require_space(brw, n * 4, ring);
 
 #ifdef DEBUG
-   brw->batch.emit = brw->batch.used;
+   brw->batch.emit = USED_BATCH(brw->batch);
    brw->batch.total = n;
 #endif
 }
@@ -145,7 +148,7 @@ intel_batchbuffer_advance(struct brw_context *brw)
 {
 #ifdef DEBUG
    struct intel_batchbuffer *batch = &brw->batch;
-   unsigned int _n = batch->used - batch->emit;
+   unsigned int _n = USED_BATCH(*batch) - batch->emit;
    assert(batch->total != 0);
    if (_n != batch->total) {
       fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",
-- 
2.3.6