On Fri, Mar 09, 2018 at 04:09:43PM +1000, Dave Airlie wrote:
> From: Dave Airlie <airl...@redhat.com>
> 
> This is ported from the sb backend; there are some issues with
> evergreen stacks on the boundary between entries and ALU_PUSH_BEFORE
> instructions.
> 
> Whenever we are going to use a push before, we check the stack
> usage and if we have to use the workaround, then we switch to
> a separate push.
> 
> I noticed this problem while dealing with some of the soft fp64 shaders;
> in nosb mode, they are quite stack happy.
More than happy actually!
On my system, I get:
[1375/1375] skip: 4, pass: 1368, fail: 3
So thanks a lot.
> 
> This fixes all the glitches and inconsistencies I've seen with them.
> 
> Signed-off-by: Dave Airlie <airl...@redhat.com>
Tested-by: Elie Tournier <elie.tourn...@collabora.com>
> ---
>  src/gallium/drivers/r600/r600_shader.c | 39 +++++++++++++++++++++++++++-------
>  1 file changed, 31 insertions(+), 8 deletions(-)
> 
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index 48750fb..3ca7890 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -377,7 +377,7 @@ struct r600_shader_tgsi_instruction {
>  static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct 
> pipe_stream_output_info *so, int stream, bool ind);
>  static const struct r600_shader_tgsi_instruction 
> r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], 
> cm_shader_tgsi_instruction[];
>  static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
> -static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned 
> reason);
> +static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned 
> reason);
>  static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
>  static int tgsi_else(struct r600_shader_ctx *ctx);
>  static int tgsi_endif(struct r600_shader_ctx *ctx);
> @@ -393,6 +393,15 @@ static void r600_bytecode_src(struct 
> r600_bytecode_alu_src *bc_src,
>  static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned 
> temp_reg,
>                              unsigned dst_reg, unsigned mask);
>  
> +static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
> +{
> +     if (ctx->bc->family == CHIP_HEMLOCK ||
> +         ctx->bc->family == CHIP_CYPRESS ||
> +         ctx->bc->family == CHIP_JUNIPER)
> +             return false;
> +     return true;
> +}
> +
>  static bool ctx_has_doubles(struct r600_shader_ctx *ctx)
>  {
>       if (ctx->bc->family == CHIP_ARUBA ||
> @@ -10182,7 +10191,7 @@ static int pops(struct r600_shader_ctx *ctx, int pops)
>       return 0;
>  }
>  
> -static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
> +static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
>                                                unsigned reason)
>  {
>       struct r600_stack_info *stack = &ctx->bc->stack;
> @@ -10200,7 +10209,7 @@ static inline void callstack_update_max_depth(struct 
> r600_shader_ctx *ctx,
>               /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 
> elements on
>                * the stack must be reserved to hold the current 
> active/continue
>                * masks */
> -             if (reason == FC_PUSH_VPM) {
> +             if (reason == FC_PUSH_VPM || stack->push > 0) {
>                       elements += 2;
>               }
>               break;
> @@ -10226,7 +10235,7 @@ static inline void callstack_update_max_depth(struct 
> r600_shader_ctx *ctx,
>                *    NOTE: it seems we also need to reserve additional element 
> in some
>                *    other cases, e.g. when we have 4 levels of PUSH_VPM in 
> the shader,
>                *    then STACK_SIZE should be 2 instead of 1 */
> -             if (reason == FC_PUSH_VPM) {
> +             if (reason == FC_PUSH_VPM || stack->push > 0) {
>                       elements += 1;
>               }
>               break;
> @@ -10245,6 +10254,7 @@ static inline void callstack_update_max_depth(struct 
> r600_shader_ctx *ctx,
>  
>       if (entries > stack->max_entries)
>               stack->max_entries = entries;
> +     return elements;
>  }
>  
>  static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned 
> reason)
> @@ -10268,7 +10278,7 @@ static inline void callstack_pop(struct 
> r600_shader_ctx *ctx, unsigned reason)
>       }
>  }
>  
> -static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned 
> reason)
> +static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned 
> reason)
>  {
>       switch (reason) {
>       case FC_PUSH_VPM:
> @@ -10276,6 +10286,7 @@ static inline void callstack_push(struct 
> r600_shader_ctx *ctx, unsigned reason)
>               break;
>       case FC_PUSH_WQM:
>               ++ctx->bc->stack.push_wqm;
> +             break;
>       case FC_LOOP:
>               ++ctx->bc->stack.loop;
>               break;
> @@ -10283,7 +10294,7 @@ static inline void callstack_push(struct 
> r600_shader_ctx *ctx, unsigned reason)
>               assert(0);
>       }
>  
> -     callstack_update_max_depth(ctx, reason);
> +     return callstack_update_max_depth(ctx, reason);
>  }
>  
>  static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
> @@ -10367,12 +10378,25 @@ static int emit_if(struct r600_shader_ctx *ctx, int 
> opcode,
>                  struct r600_bytecode_alu_src *src)
>  {
>       int alu_type = CF_OP_ALU_PUSH_BEFORE;
> +     bool needs_workaround = false;
> +     int elems = callstack_push(ctx, FC_PUSH_VPM);
> +
> +     if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
> +             needs_workaround = true;
> +
> +     if (ctx->bc->chip_class == EVERGREEN && 
> ctx_needs_stack_workaround_8xx(ctx)) {
> +             unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
> +             unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;
> +
> +             if (elems && (!dmod1 || !dmod2))
> +                     needs_workaround = true;
> +     }
>  
>       /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
>        * LOOP_STARTxxx for nested loops may put the branch stack into a state
>        * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
>        * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
> -     if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
> +     if (needs_workaround) {
>               r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
>               ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
>               alu_type = CF_OP_ALU;
> @@ -10384,7 +10408,6 @@ static int emit_if(struct r600_shader_ctx *ctx, int 
> opcode,
>  
>       fc_pushlevel(ctx, FC_IF);
>  
> -     callstack_push(ctx, FC_PUSH_VPM);
>       return 0;
>  }
>  
> -- 
> 2.9.5
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to