On Mon, Aug 10, 2015 at 1:47 PM, Marek Olšák <mar...@gmail.com> wrote: > Please never use "long" in Mesa. It only has 32 bits on 32-bit > systems. uint64_t is generally used for all unsigned 64-bit variables > and "llu" or "ull" is the number suffix. Also, the 64-bit ctz is ctzll > and a proper HAVE macro should be added for it too.
Well I intentionally chose long to have a machine-word sized type, for uint64_t gcc would have to emit multiple instructions for bit setting and ctzll on 32bit CPUs (actually a library call for ctzll from what I see), so the idea was to use longer array there instead. > The general idea is nice, thanks. > > The number of atoms can be cut down by merging all scissors states > into 1 atom (just as there is 1 atom for 16 textures, there can be 1 > atom for 16 scissors) and the same applies to viewport states. This > would simplify the code, because all dirty bits would fit into 64 bits > and there would even be some space left. This sounds good, I'll try to work on it when I can find time. Also, do you know why the atoms start from 4 for r600, and 0-3 seem to be unused? (also CC'ing Jerome, seems to be his code) Gražvydas > > Patches 1-3: > Reviewed-by: Marek Olšák <marek.ol...@amd.com> > > Marek > > On Sun, Aug 9, 2015 at 11:42 PM, Grazvydas Ignotas <nota...@gmail.com> wrote: >> r600 currently has 73 atoms and looping through their dirty flags has >> become costly because checking each flag requires a pointer >> dereference before the read. To avoid having to do that add additional >> bitfield which can be checked really quickly thanks to tzcnt instruction. >> >> id field was added to struct r600_atom but that doesn't affect memory >> usage for both 32 and 64 bit CPUs because it was stuffed into padding. >> >> The performance improvement is ~2% for benchmarks that can have FPS in >> the thousands but is hardly measurable in "real" programs. >> --- >> src/gallium/drivers/r600/r600_hw_context.c | 12 +++---- >> src/gallium/drivers/r600/r600_pipe.h | 45 >> +++++++++++++++++++++++++++ >> src/gallium/drivers/r600/r600_state_common.c | 8 ++--- >> src/gallium/drivers/radeon/r600_pipe_common.h | 1 + >> 4 files changed, 56 insertions(+), 10 deletions(-) >> >> diff --git a/src/gallium/drivers/r600/r600_hw_context.c >> b/src/gallium/drivers/r600/r600_hw_context.c >> index c048a71..6445151 100644 >> --- a/src/gallium/drivers/r600/r600_hw_context.c >> +++ b/src/gallium/drivers/r600/r600_hw_context.c >> @@ -51,13 +51,13 @@ void r600_need_cs_space(struct r600_context *ctx, >> unsigned num_dw, >> unsigned i; >> >> /* The number of dwords all the dirty states would take. */ >> - for (i = 0; i < R600_NUM_ATOMS; i++) { >> - if (ctx->atoms[i] && ctx->atoms[i]->dirty) { >> - num_dw += ctx->atoms[i]->num_dw; >> - if (ctx->screen->b.trace_bo) { >> - num_dw += R600_TRACE_CS_DWORDS; >> - } >> + i = r600_next_dirty_atom(ctx, 0); >> + while (i < R600_NUM_ATOMS) { >> + num_dw += ctx->atoms[i]->num_dw; >> + if (ctx->screen->b.trace_bo) { >> + num_dw += R600_TRACE_CS_DWORDS; >> } >> + i = r600_next_dirty_atom(ctx, i + 1); >> } >> >> /* The upper-bound of how much space a draw command would >> take. */ >> diff --git a/src/gallium/drivers/r600/r600_pipe.h >> b/src/gallium/drivers/r600/r600_pipe.h >> index 8b61269..5d10bb4 100644 >> --- a/src/gallium/drivers/r600/r600_pipe.h >> +++ b/src/gallium/drivers/r600/r600_pipe.h >> @@ -85,6 +85,9 @@ >> #define R600_BIG_ENDIAN 0 >> #endif >> >> +#define R600_DIRTY_ATOM_WORD_BITS (sizeof(unsigned long) * 8) >> +#define R600_DIRTY_ATOM_ARRAY_LEN DIV_ROUND_UP(R600_NUM_ATOMS, >> R600_DIRTY_ATOM_WORD_BITS) >> + >> struct r600_context; >> struct r600_bytecode; >> struct r600_shader_key; >> @@ -426,6 +429,8 @@ struct r600_context { >> >> /* State binding slots are here. */ >> struct r600_atom *atoms[R600_NUM_ATOMS]; >> + /* Dirty atom bitmask for fast tests */ >> + unsigned long >> dirty_atoms[R600_DIRTY_ATOM_ARRAY_LEN]; >> /* States for CS initialization. */ >> struct r600_command_buffer start_cs_cmd; /* invariant state >> mostly */ >> /** Compute specific registers initializations. The start_cs_cmd >> atom >> @@ -502,7 +507,18 @@ static inline void r600_set_atom_dirty(struct >> r600_context *rctx, >> struct r600_atom *atom, >> bool dirty) >> { >> + unsigned long mask; >> + unsigned int w; >> + >> atom->dirty = dirty; >> + >> + assert(atom->id != 0); >> + w = atom->id / R600_DIRTY_ATOM_WORD_BITS; >> + mask = 1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS); >> + if (dirty) >> + rctx->dirty_atoms[w] |= mask; >> + else >> + rctx->dirty_atoms[w] &= ~mask; >> } >> >> static inline void r600_mark_atom_dirty(struct r600_context *rctx, >> @@ -511,6 +527,35 @@ static inline void r600_mark_atom_dirty(struct >> r600_context *rctx, >> r600_set_atom_dirty(rctx, atom, true); >> } >> >> +static inline unsigned int r600_next_dirty_atom(struct r600_context *rctx, >> + unsigned int id) >> +{ >> +#if !defined(DEBUG) && defined(HAVE___BUILTIN_CTZ) >> + unsigned int w = id / R600_DIRTY_ATOM_WORD_BITS; >> + unsigned int bit = id % R600_DIRTY_ATOM_WORD_BITS; >> + unsigned long bits, mask = (1ul << bit) - 1; >> + >> + for (; w < R600_DIRTY_ATOM_ARRAY_LEN; w++, mask = 0ul) { >> + bits = rctx->dirty_atoms[w] & ~mask; >> + if (bits == 0) >> + continue; >> + return w * R600_DIRTY_ATOM_WORD_BITS + __builtin_ctzl(bits); >> + } >> + >> + return R600_NUM_ATOMS; >> +#else >> + for (; id < R600_NUM_ATOMS; id++) { >> + bool dirty = !!(rctx->dirty_atoms[id / >> R600_DIRTY_ATOM_WORD_BITS] & >> + (1ul << (id % R600_DIRTY_ATOM_WORD_BITS))); >> + assert(dirty == (rctx->atoms[id] && rctx->atoms[id]->dirty)); >> + if (dirty) >> + break; >> + } >> + >> + return id; >> +#endif >> +} >> + >> void r600_trace_emit(struct r600_context *rctx); >> >> static inline void r600_emit_atom(struct r600_context *rctx, struct >> r600_atom *atom) >> diff --git a/src/gallium/drivers/r600/r600_state_common.c >> b/src/gallium/drivers/r600/r600_state_common.c >> index a4238a2..8d0942f 100644 >> --- a/src/gallium/drivers/r600/r600_state_common.c >> +++ b/src/gallium/drivers/r600/r600_state_common.c >> @@ -54,6 +54,7 @@ void r600_add_atom(struct r600_context *rctx, >> assert(id < R600_NUM_ATOMS); >> assert(rctx->atoms[id] == NULL); >> rctx->atoms[id] = atom; >> + atom->id = id; >> atom->dirty = false; >> } >> >> @@ -1476,11 +1477,10 @@ static void r600_draw_vbo(struct pipe_context *ctx, >> const struct pipe_draw_info >> r600_need_cs_space(rctx, ib.user_buffer ? 5 : 0, TRUE); >> r600_flush_emit(rctx); >> >> - for (i = 0; i < R600_NUM_ATOMS; i++) { >> - if (rctx->atoms[i] == NULL || !rctx->atoms[i]->dirty) { >> - continue; >> - } >> + i = r600_next_dirty_atom(rctx, 0); >> + while (i < R600_NUM_ATOMS) { >> r600_emit_atom(rctx, rctx->atoms[i]); >> + i = r600_next_dirty_atom(rctx, i + 1); >> } >> >> if (rctx->b.chip_class == CAYMAN) { >> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h >> b/src/gallium/drivers/radeon/r600_pipe_common.h >> index 717e248..17da41e 100644 >> --- a/src/gallium/drivers/radeon/r600_pipe_common.h >> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h >> @@ -314,6 +314,7 @@ struct r600_common_screen { >> struct r600_atom { >> void (*emit)(struct r600_common_context *ctx, struct r600_atom >> *state); >> unsigned num_dw; >> + unsigned short id; /* used by r600 only */ >> bool dirty; >> }; >> >> -- >> 1.9.1 >> >> _______________________________________________ >> mesa-dev mailing list >> mesa-dev@lists.freedesktop.org >> http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev