Hi Vadim, Does this patch work ? (It's still not pushed) I'm working on doing native control flow for llvm and intend to port your patch on the control flow reservation.
Vincent ----- Mail original ----- > De : Vadim Girlin <vadimgir...@gmail.com> > À : Alex Deucher <alexdeuc...@gmail.com> > Cc : mesa-dev@lists.freedesktop.org > Envoyé le : Vendredi 22 février 2013 1h37 > Objet : Re: [Mesa-dev] [PATCH] r600g: don't reserve more stack space than > required v4 > > On 02/22/2013 04:23 AM, Alex Deucher wrote: >> On Thu, Feb 21, 2013 at 6:52 PM, Vadim Girlin <vadimgir...@gmail.com> > wrote: >>> v4: implement exact computation taking into account wavefront size >>> >>> Signed-off-by: Vadim Girlin <vadimgir...@gmail.com> >>> --- >>> src/gallium/drivers/r600/r600_asm.c | 44 +++++++++-- >>> src/gallium/drivers/r600/r600_asm.h | 24 ++++-- >>> src/gallium/drivers/r600/r600_shader.c | 131 > ++++++++++++++++++++++----------- >>> 3 files changed, 142 insertions(+), 57 deletions(-) >>> >>> diff --git a/src/gallium/drivers/r600/r600_asm.c > b/src/gallium/drivers/r600/r600_asm.c >>> index 3632aa5..f041e27 100644 >>> --- a/src/gallium/drivers/r600/r600_asm.c >>> +++ b/src/gallium/drivers/r600/r600_asm.c >>> @@ -86,6 +86,38 @@ static struct r600_bytecode_tex > *r600_bytecode_tex(void) >>> return tex; >>> } >>> >>> +static unsigned stack_entry_size(enum radeon_family chip) { >>> + /* Wavefront size: >>> + * 64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/ >>> + * Aruba/Sumo/Sumo2/redwood/juniper >>> + * 32: R630/R730/R710/Palm/Cedar >>> + * 16: R610/Rs780 >>> + * >>> + * Stack row size: >>> + * Wavefront Size 16 32 48 64 >>> + * Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4 >>> + * Columns per Row (R9xx+) 8 4 4 4 */ >>> + >>> + switch (chip) { >>> + /* FIXME: are some chips missing here? */ >>> + /* wavefront size 16 */ >>> + case CHIP_RV610: >>> + case CHIP_RS780: >> >> RV620 >> RS880 >> >> Should be 16 as well. > > Thanks, I'll add them. > > Vadim > >> >>> + /* wavefront size 32 */ >>> + case CHIP_RV630: >>> + case CHIP_RV635: >>> + case CHIP_RV730: >>> + case CHIP_RV710: >>> + case CHIP_PALM: >>> + case CHIP_CEDAR: >>> + return 8; >>> + >>> + /* wavefront size 64 */ >>> + default: >>> + return 4; >>> + } >>> +} >>> + >>> void r600_bytecode_init(struct r600_bytecode *bc, >>> enum chip_class chip_class, >>> enum radeon_family family, >>> @@ -103,6 +135,7 @@ void r600_bytecode_init(struct r600_bytecode *bc, >>> LIST_INITHEAD(&bc->cf); >>> bc->chip_class = chip_class; >>> bc->msaa_texture_mode = msaa_texture_mode; >>> + bc->stack.entry_size = stack_entry_size(family); >>> } >>> >>> static int r600_bytecode_add_cf(struct r600_bytecode *bc) >>> @@ -1524,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc) >>> unsigned addr; >>> int i, r; >>> >>> - if (bc->callstack[0].max > 0) >>> - bc->nstack = ((bc->callstack[0].max + 3) >> > 2) + 2; >>> + bc->nstack = bc->stack.max_entries; >>> + >>> if (bc->type == TGSI_PROCESSOR_VERTEX && > !bc->nstack) { >>> bc->nstack = 1; >>> } >>> @@ -1826,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode > *bc) >>> chip = '6'; >>> break; >>> } >>> - fprintf(stderr, "bytecode %d dw -- %d gprs > ---------------------\n", >>> - bc->ndw, bc->ngpr); >>> + fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack > -------------\n", >>> + bc->ndw, bc->ngpr, bc->nstack); >>> fprintf(stderr, "shader %d -- %c\n", index++, > chip); >>> >>> LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { >>> @@ -2105,7 +2138,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc) >>> chip = '6'; >>> break; >>> } >>> - fprintf(stderr, "bytecode %d dw -- %d gprs > ---------------------\n", bc->ndw, bc->ngpr); >>> + fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack > -------------\n", >>> + bc->ndw, bc->ngpr, bc->nstack); >>> fprintf(stderr, " %c\n", chip); >>> >>> LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { >>> diff --git a/src/gallium/drivers/r600/r600_asm.h > b/src/gallium/drivers/r600/r600_asm.h >>> index 03cd238..5a9869d 100644 >>> --- a/src/gallium/drivers/r600/r600_asm.h >>> +++ b/src/gallium/drivers/r600/r600_asm.h >>> @@ -173,16 +173,25 @@ struct r600_cf_stack_entry { >>> }; >>> >>> #define SQ_MAX_CALL_DEPTH 0x00000020 >>> -struct r600_cf_callstack { >>> - unsigned fc_sp_before_entry; >>> - int sub_desc_index; >>> - int current; >>> - int max; >>> -}; >>> >>> #define AR_HANDLE_NORMAL 0 >>> #define AR_HANDLE_RV6XX 1 /* except RV670 */ >>> >>> +struct r600_stack_info { >>> + /* current level of non-WQM PUSH operations >>> + * (PUSH, PUSH_ELSE, ALU_PUSH_BEFORE) */ >>> + int push; >>> + /* current level of WQM PUSH operations >>> + * (PUSH, PUSH_ELSE, PUSH_WQM) */ >>> + int push_wqm; >>> + /* current loop level */ >>> + int loop; >>> + >>> + /* required depth */ >>> + int max_entries; >>> + /* subentries per entry */ >>> + int entry_size; >>> +}; >>> >>> struct r600_bytecode { >>> enum chip_class chip_class; >>> @@ -199,8 +208,7 @@ struct r600_bytecode { >>> uint32_t *bytecode; >>> uint32_t fc_sp; >>> struct r600_cf_stack_entry fc_stack[32]; >>> - unsigned call_sp; >>> - struct r600_cf_callstack callstack[SQ_MAX_CALL_DEPTH]; >>> + struct r600_stack_info stack; >>> unsigned ar_loaded; >>> unsigned ar_reg; >>> unsigned ar_chan; >>> diff --git a/src/gallium/drivers/r600/r600_shader.c > b/src/gallium/drivers/r600/r600_shader.c >>> index 8642463..404aea7 100644 >>> --- a/src/gallium/drivers/r600/r600_shader.c >>> +++ b/src/gallium/drivers/r600/r600_shader.c >>> @@ -234,7 +234,7 @@ struct r600_shader_tgsi_instruction { >>> >>> static struct r600_shader_tgsi_instruction > r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], > cm_shader_tgsi_instruction[]; >>> static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx); >>> -static inline void callstack_check_depth(struct r600_shader_ctx *ctx, > unsigned reason, unsigned check_max_only); >>> +static inline void callstack_push(struct r600_shader_ctx *ctx, > unsigned reason); >>> static void fc_pushlevel(struct r600_shader_ctx *ctx, int type); >>> static int tgsi_else(struct r600_shader_ctx *ctx); >>> static int tgsi_endif(struct r600_shader_ctx *ctx); >>> @@ -412,7 +412,7 @@ static void llvm_if(struct r600_shader_ctx *ctx) >>> { >>> r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); >>> fc_pushlevel(ctx, FC_IF); >>> - callstack_check_depth(ctx, FC_PUSH_VPM, 0); >>> + callstack_push(ctx, FC_PUSH_VPM); >>> } >>> >>> static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx) >>> @@ -5522,63 +5522,107 @@ static int pops(struct r600_shader_ctx *ctx, > int pops) >>> return 0; >>> } >>> >>> -static inline void callstack_decrease_current(struct r600_shader_ctx > *ctx, unsigned reason) >>> +static inline void callstack_update_max_depth(struct r600_shader_ctx > *ctx, >>> + unsigned reason) >>> +{ >>> + struct r600_stack_info *stack = &ctx->bc->stack; >>> + unsigned elements, entries; >>> + >>> + unsigned entry_size = stack->entry_size; >>> + >>> + elements = (stack->loop + stack->push_wqm ) * entry_size; >>> + elements += stack->push; >>> + >>> + switch (ctx->bc->chip_class) { >>> + case R600: >>> + case R700: >>> + /* pre-r8xx: if any non-WQM PUSH instruction is > invoked, 2 elements on >>> + * the stack must be reserved to hold the current > active/continue >>> + * masks */ >>> + if (reason == FC_PUSH_VPM) { >>> + elements += 2; >>> + } >>> + break; >>> + >>> + case CAYMAN: >>> + /* r9xx: any stack operation on empty stack consumes 2 > additional >>> + * elements */ >>> + elements += 2; >>> + >>> + /* fallthrough */ >>> + /* FIXME: do the two elements added above cover the > cases for the >>> + * r8xx+ below? */ >>> + >>> + case EVERGREEN: >>> + /* r8xx+: 2 extra elements are not always required, but > one extra >>> + * element must be added for each of the following > cases: >>> + * 1. There is an ALU_ELSE_AFTER instruction at the > point of greatest >>> + * stack usage. >>> + * (Currently we don't use ALU_ELSE_AFTER.) >>> + * 2. There are LOOP/WQM frames on the stack when any > flavor of non-WQM >>> + * PUSH instruction executed. >>> + * >>> + * NOTE: it seems we also need to reserve additional > element in some >>> + * other cases, e.g. when we have 4 levels of > PUSH_VPM in the shader, >>> + * then STACK_SIZE should be 2 instead of 1 */ >>> + if (reason == FC_PUSH_VPM) { >>> + elements += 1; >>> + } >>> + break; >>> + >>> + default: >>> + assert(0); >>> + break; >>> + } >>> + >>> + /* NOTE: it seems STACK_SIZE is interpreted by hw as if > entry_size is 4 >>> + * for all chips, so we use 4 in the final formula, not the > real entry_size >>> + * for the chip */ >>> + entry_size = 4; >>> + >>> + entries = (elements + (entry_size - 1)) / entry_size; >>> + >>> + if (entries > stack->max_entries) >>> + stack->max_entries = entries; >>> +} >>> + >>> +static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned > reason) >>> { >>> switch(reason) { >>> case FC_PUSH_VPM: >>> - > ctx->bc->callstack[ctx->bc->call_sp].current--; >>> + --ctx->bc->stack.push; >>> + assert(ctx->bc->stack.push >= 0); >>> break; >>> case FC_PUSH_WQM: >>> + --ctx->bc->stack.push_wqm; >>> + assert(ctx->bc->stack.push_wqm >= 0); >>> + break; >>> case FC_LOOP: >>> - > ctx->bc->callstack[ctx->bc->call_sp].current -= 4; >>> + --ctx->bc->stack.loop; >>> + assert(ctx->bc->stack.loop >= 0); >>> break; >>> - case FC_REP: >>> - /* TOODO : for 16 vp asic should -= 2; */ >>> - > ctx->bc->callstack[ctx->bc->call_sp].current --; >>> + default: >>> + assert(0); >>> break; >>> } >>> } >>> >>> -static inline void callstack_check_depth(struct r600_shader_ctx *ctx, > unsigned reason, unsigned check_max_only) >>> +static inline void callstack_push(struct r600_shader_ctx *ctx, > unsigned reason) >>> { >>> - if (check_max_only) { >>> - int diff; >>> - switch (reason) { >>> - case FC_PUSH_VPM: >>> - diff = 1; >>> - break; >>> - case FC_PUSH_WQM: >>> - diff = 4; >>> - break; >>> - default: >>> - assert(0); >>> - diff = 0; >>> - } >>> - if > ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) > >>> - > ctx->bc->callstack[ctx->bc->call_sp].max) { >>> - > ctx->bc->callstack[ctx->bc->call_sp].max = >>> - > ctx->bc->callstack[ctx->bc->call_sp].current + diff; >>> - } >>> - return; >>> - } >>> switch (reason) { >>> case FC_PUSH_VPM: >>> - > ctx->bc->callstack[ctx->bc->call_sp].current++; >>> + ++ctx->bc->stack.push; >>> break; >>> case FC_PUSH_WQM: >>> + ++ctx->bc->stack.push_wqm; >>> case FC_LOOP: >>> - > ctx->bc->callstack[ctx->bc->call_sp].current += 4; >>> - break; >>> - case FC_REP: >>> - > ctx->bc->callstack[ctx->bc->call_sp].current++; >>> + ++ctx->bc->stack.loop; >>> break; >>> + default: >>> + assert(0); >>> } >>> >>> - if ((ctx->bc->callstack[ctx->bc->call_sp].current) >> >>> - ctx->bc->callstack[ctx->bc->call_sp].max) { >>> - ctx->bc->callstack[ctx->bc->call_sp].max = >>> - > ctx->bc->callstack[ctx->bc->call_sp].current; >>> - } >>> + callstack_update_max_depth(ctx, reason); >>> } >>> >>> static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp) >>> @@ -5665,7 +5709,7 @@ static int tgsi_if(struct r600_shader_ctx *ctx) >>> >>> fc_pushlevel(ctx, FC_IF); >>> >>> - callstack_check_depth(ctx, FC_PUSH_VPM, 0); >>> + callstack_push(ctx, FC_PUSH_VPM); >>> return 0; >>> } >>> >>> @@ -5695,7 +5739,7 @@ static int tgsi_endif(struct r600_shader_ctx > *ctx) >>> } >>> fc_poplevel(ctx); >>> >>> - callstack_decrease_current(ctx, FC_PUSH_VPM); >>> + callstack_pop(ctx, FC_PUSH_VPM); >>> return 0; >>> } >>> >>> @@ -5708,7 +5752,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx > *ctx) >>> fc_pushlevel(ctx, FC_LOOP); >>> >>> /* check stack depth */ >>> - callstack_check_depth(ctx, FC_LOOP, 0); >>> + callstack_push(ctx, FC_LOOP); >>> return 0; >>> } >>> >>> @@ -5737,7 +5781,7 @@ static int tgsi_endloop(struct r600_shader_ctx > *ctx) >>> } >>> /* XXX add LOOPRET support */ >>> fc_poplevel(ctx); >>> - callstack_decrease_current(ctx, FC_LOOP); >>> + callstack_pop(ctx, FC_LOOP); >>> return 0; >>> } >>> >>> @@ -5760,7 +5804,6 @@ static int tgsi_loop_brk_cont(struct > r600_shader_ctx *ctx) >>> >>> fc_set_mid(ctx, fscp); >>> >>> - callstack_check_depth(ctx, FC_PUSH_VPM, 1); >>> return 0; >>> } >>> >>> -- >>> 1.8.1.2 >>> >>> _______________________________________________ >>> mesa-dev mailing list >>> mesa-dev@lists.freedesktop.org >>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev