Re: [Mesa-dev] [PATCH 5/5] i965: Write a scalar TCS backend that runs in SINGLE_PATCH mode.

Jordan Justen Thu, 28 Apr 2016 16:22:10 -0700

On 2016-04-21 22:32:09, Kenneth Graunke wrote:
> Unlike most shader stages, the Hull Shader hardware makes us explicitly
> tell it how many threads to dispatch and manually configure the channel
> mask.  One perk of this is that we have a lot of flexibility - we can
> run it in either SIMD4x2 or SIMD8 mode.
> 
> Treating it as SIMD8 means that shaders with 8 or fewer output vertices
> (which is overwhelmingly the common case) can be handled by a single
> thread.  This has several intriguing properties:
> 
> - Accessing input arrays with gl_InvocationID as the index is a simple
>   SIMD8 URB read with g1 as the header.  No indirect addressing required.
> - Barriers are no-ops.
> - We could potentially do output shadowing to combine writes, as the
>   concurrency concerns are gone.  (We don't do this yet, though.)
> 
> Signed-off-by: Kenneth Graunke <kenn...@whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_compiler.c     |   4 +-
>  src/mesa/drivers/dri/i965/brw_fs.cpp         |  97 ++++++++
>  src/mesa/drivers/dri/i965/brw_fs.h           |   5 +
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp     | 356 
> +++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |   3 +
>  src/mesa/drivers/dri/i965/brw_tcs.c          |   3 +-
>  src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp   |  59 ++++-
>  7 files changed, 512 insertions(+), 15 deletions(-)
> 
> Shockingly, this appears to cut instruction counts in Unigine Heaven
> (-2.5 to 5.5%), Synmark (-31%), and Tessmark (-37%).  It increases
> instruction counts in Shadow of Mordor (up to +57%) - but again, this
> is running in scalar mode, so larger instruction counts are expected :)
> I also have a bunch of optimizations in progress that will help those.
> 
> Cycle counts look pretty good too.
> 
> This patch leaves it off by default because I haven't properly benchmarked
> it yet.  I fully expect we'll turn it on by default.
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c 
> b/src/mesa/drivers/dri/i965/brw_compiler.c
> index 4496699..93a30a5 100644
> --- a/src/mesa/drivers/dri/i965/brw_compiler.c
> +++ b/src/mesa/drivers/dri/i965/brw_compiler.c
> @@ -152,7 +152,8 @@ brw_compiler_create(void *mem_ctx, const struct 
> brw_device_info *devinfo)
>  
>     compiler->scalar_stage[MESA_SHADER_VERTEX] =
>        devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
> -   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
> +   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
> +      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", false);
>     compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
>        devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
>     compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
> @@ -194,6 +195,7 @@ brw_compiler_create(void *mem_ctx, const struct 
> brw_device_info *devinfo)
>  
>     
> compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = 
> false;
>     
> compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = 
> false;
> +   
> compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = 
> false;
>  
>     if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
>        
> compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = 
> false;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 5d6a107..be5edb8 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1758,6 +1758,21 @@ fs_visitor::assign_vs_urb_setup()
>  }
>  
>  void
> +fs_visitor::assign_tcs_single_patch_urb_setup()
> +{
> +   assert(stage == MESA_SHADER_TESS_CTRL);
> +
> +   brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
> +
> +   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;


We talked about this bit of code offline. This is just a reminder that
you were going to check whether it is currently used.

Patches 3-5 Reviewed-by: Jordan Justen <jordan.l.jus...@intel.com>

> +
> +   /* Rewrite all ATTR file references to HW_REGs. */
> +   foreach_block_and_inst(block, fs_inst, inst, cfg) {
> +      convert_attr_sources_to_hw_regs(inst);
> +   }
> +}
> +
> +void
>  fs_visitor::assign_tes_urb_setup()
>  {
>     assert(stage == MESA_SHADER_TESS_EVAL);
> @@ -5463,6 +5478,88 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
>  }
>  
>  bool
> +fs_visitor::run_tcs_single_patch()
> +{
> +   assert(stage == MESA_SHADER_TESS_CTRL);
> +
> +   struct brw_tcs_prog_data *tcs_prog_data =
> +      (struct brw_tcs_prog_data *) prog_data;
> +
> +   /* r1-r4 contain the ICP handles. */
> +   payload.num_regs = 5;
> +
> +   if (shader_time_index >= 0)
> +      emit_shader_time_begin();
> +
> +   /* Initialize gl_InvocationID */
> +   fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
> +   fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
> +   bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
> +   bld.MOV(channels_ud, channels_uw);
> +
> +   if (tcs_prog_data->instances == 1) {
> +      invocation_id = channels_ud;
> +   } else {
> +      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
> +
> +      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
> +      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
> +      fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
> +      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
> +              brw_imm_ud(INTEL_MASK(23, 17)));
> +      bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));
> +
> +      bld.ADD(invocation_id, instance_times_8, channels_ud);
> +   }
> +
> +   /* Fix the dispatch mask */
> +   if (nir->info.tcs.vertices_out % 8) {
> +      bld.CMP(bld.null_reg_ud(), invocation_id,
> +              brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L);
> +      bld.IF(BRW_PREDICATE_NORMAL);
> +   }
> +
> +   emit_nir_code();
> +
> +   if (nir->info.tcs.vertices_out % 8) {
> +      bld.emit(BRW_OPCODE_ENDIF);
> +   }
> +
> +   /* Emit EOT write; set TR DS Cache bit */
> +   fs_reg srcs[3] = {
> +      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
> +      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
> +      fs_reg(brw_imm_ud(0)),
> +   };
> +   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
> +   bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
> +
> +   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
> +                            bld.null_reg_ud(), payload);
> +   inst->mlen = 3;
> +   inst->base_mrf = -1;
> +   inst->eot = true;
> +
> +   if (shader_time_index >= 0)
> +      emit_shader_time_end();
> +
> +   if (failed)
> +      return false;
> +
> +   calculate_cfg();
> +
> +   optimize();
> +
> +   assign_curb_setup();
> +   assign_tcs_single_patch_urb_setup();
> +
> +   fixup_3src_null_dest();
> +   allocate_registers();
> +
> +   return !failed;
> +}
> +
> +bool
>  fs_visitor::run_tes()
>  {
>     assert(stage == MESA_SHADER_TESS_EVAL);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
> b/src/mesa/drivers/dri/i965/brw_fs.h
> index bcd2e3e..f24c78a 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -110,6 +110,7 @@ public:
>  
>     bool run_fs(bool do_rep_send);
>     bool run_vs(gl_clip_plane *clip_planes);
> +   bool run_tcs_single_patch();
>     bool run_tes();
>     bool run_gs();
>     bool run_cs();
> @@ -126,6 +127,7 @@ public:
>     void assign_urb_setup();
>     void convert_attr_sources_to_hw_regs(fs_inst *inst);
>     void assign_vs_urb_setup();
> +   void assign_tcs_single_patch_urb_setup();
>     void assign_tes_urb_setup();
>     void assign_gs_urb_setup();
>     bool assign_regs(bool allow_spilling);
> @@ -249,6 +251,8 @@ public:
>                         nir_ssa_undef_instr *instr);
>     void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
>                                nir_intrinsic_instr *instr);
> +   void nir_emit_tcs_intrinsic(const brw::fs_builder &bld,
> +                               nir_intrinsic_instr *instr);
>     void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
>                                nir_intrinsic_instr *instr);
>     void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
> @@ -404,6 +408,7 @@ public:
>     fs_reg userplane[MAX_CLIP_PLANES];
>     fs_reg final_gs_vertex_count;
>     fs_reg control_data_bits;
> +   fs_reg invocation_id;
>  
>     unsigned grf_used;
>     bool spilled_any_registers;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index cf4f782..e617083 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -114,6 +114,9 @@ fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
>  void
>  fs_visitor::nir_setup_outputs()
>  {
> +   if (stage == MESA_SHADER_TESS_CTRL)
> +      return;
> +
>     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
>  
>     nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
> @@ -232,6 +235,8 @@ emit_system_values_block(nir_block *block, void 
> *void_visitor)
>           break;
>  
>        case nir_intrinsic_load_invocation_id:
> +         if (v->stage == MESA_SHADER_TESS_CTRL)
> +            break;
>           assert(v->stage == MESA_SHADER_GEOMETRY);
>           reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
>           if (reg->file == BAD_FILE) {
> @@ -452,6 +457,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
>        case MESA_SHADER_VERTEX:
>           nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
>           break;
> +      case MESA_SHADER_TESS_CTRL:
> +         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
> +         break;
>        case MESA_SHADER_TESS_EVAL:
>           nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
>           break;
> @@ -1901,6 +1909,354 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder 
> &bld,
>  }
>  
>  void
> +fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
> +                                   nir_intrinsic_instr *instr)
> +{
> +   assert(stage == MESA_SHADER_TESS_CTRL);
> +   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
> +   struct brw_tcs_prog_data *tcs_prog_data =
> +      (struct brw_tcs_prog_data *) prog_data;
> +
> +   fs_reg dst;
> +   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> +      dst = get_nir_dest(instr->dest);
> +
> +   switch (instr->intrinsic) {
> +   case nir_intrinsic_load_primitive_id:
> +      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
> +      break;
> +   case nir_intrinsic_load_invocation_id:
> +      bld.MOV(retype(dst, invocation_id.type), invocation_id);
> +      break;
> +   case nir_intrinsic_load_patch_vertices_in:
> +      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
> +              brw_imm_d(tcs_key->input_vertices));
> +      break;
> +
> +   case nir_intrinsic_barrier: {
> +      if (tcs_prog_data->instances == 1)
> +         break;
> +
> +      fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +      fs_reg m0_2 = byte_offset(m0, 2 * sizeof(uint32_t));
> +
> +      const fs_builder fwa_bld = bld.exec_all();
> +
> +      /* Zero the message header */
> +      fwa_bld.MOV(m0, brw_imm_ud(0u));
> +
> +      /* Copy "Barrier ID" from r0.2, bits 16:13 */
> +      fwa_bld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
> +                  brw_imm_ud(INTEL_MASK(16, 13)));
> +
> +      /* Shift it up to bits 27:24. */
> +      fwa_bld.SHL(m0_2, m0_2, brw_imm_ud(11));
> +
> +      /* Set the Barrier Count and the enable bit */
> +      fwa_bld.OR(m0_2, m0_2,
> +                 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
> +
> +      bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
> +      break;
> +   }
> +
> +   case nir_intrinsic_load_input:
> +      unreachable("nir_lower_io should never give us these.");
> +      break;
> +
> +   case nir_intrinsic_load_per_vertex_input: {
> +      fs_reg indirect_offset = get_indirect_offset(instr);
> +      unsigned imm_offset = instr->const_index[0];
> +
> +      const nir_src &vertex_src = instr->src[0];
> +      nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
> +
> +      fs_inst *inst;
> +
> +      fs_reg icp_handle;
> +
> +      if (vertex_const) {
> +         /* Emit a MOV to resolve <0,1,0> regioning. */
> +         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +         bld.MOV(icp_handle,
> +                 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
> +                                     vertex_const->i32[0] & 7),
> +                        BRW_REGISTER_TYPE_UD));
> +      } else if (tcs_prog_data->instances == 1 &&
> +                 vertex_src.is_ssa &&
> +                 vertex_src.ssa->parent_instr->type == 
> nir_instr_type_intrinsic &&
> +                 
> nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == 
> nir_intrinsic_load_invocation_id) {
> +         /* For the common case of only 1 instance, an array index of
> +          * gl_InvocationID means reading g1.  Skip all the indirect work.
> +          */
> +         icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
> +      } else {
> +         /* The vertex index is non-constant.  We need to use indirect
> +          * addressing to fetch the proper URB handle.
> +          */
> +         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +
> +         /* Each ICP handle is a single DWord (4 bytes) */
> +         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +         bld.SHL(vertex_offset_bytes,
> +                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
> +                 brw_imm_ud(2u));
> +
> +         /* Start at g1.  We might read up to 4 registers. */
> +         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
> +                  fs_reg(brw_vec8_grf(1, 0)), vertex_offset_bytes,
> +                  brw_imm_ud(4 * REG_SIZE));
> +      }
> +
> +      if (indirect_offset.file == BAD_FILE) {
> +         /* Constant indexing - use global offset. */
> +         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
> +         inst->offset = imm_offset;
> +         inst->mlen = 1;
> +         inst->base_mrf = -1;
> +         inst->regs_written = instr->num_components;
> +      } else {
> +         /* Indirect indexing - use per-slot offsets as well. */
> +         const fs_reg srcs[] = { icp_handle, indirect_offset };
> +         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
> +         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
> +
> +         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 
> payload);
> +         inst->offset = imm_offset;
> +         inst->base_mrf = -1;
> +         inst->mlen = 2;
> +         inst->regs_written = instr->num_components;
> +      }
> +
> +      /* Copy the temporary to the destination to deal with writemasking.
> +       *
> +       * Also attempt to deal with gl_PointSize being in the .w component.
> +       */
> +      if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
> +         inst->dst = bld.vgrf(dst.type, 4);
> +         inst->regs_written = 4;
> +         bld.MOV(dst, offset(inst->dst, bld, 3));
> +      }
> +      break;
> +   }
> +
> +   case nir_intrinsic_load_output:
> +   case nir_intrinsic_load_per_vertex_output: {
> +      fs_reg indirect_offset = get_indirect_offset(instr);
> +      unsigned imm_offset = instr->const_index[0];
> +
> +      fs_inst *inst;
> +      if (indirect_offset.file == BAD_FILE) {
> +         /* Replicate the patch handle to all enabled channels */
> +         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +         bld.MOV(patch_handle,
> +                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
> +
> +         if (imm_offset == 0) {
> +            /* This is a read of gl_TessLevelInner[], which lives in the
> +             * Patch URB header.  The layout depends on the domain.
> +             */
> +            dst.type = BRW_REGISTER_TYPE_F;
> +            switch (tcs_key->tes_primitive_mode) {
> +            case GL_QUADS: {
> +               /* DWords 3-2 (reversed) */
> +               fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +
> +               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 
> patch_handle);
> +               inst->offset = 0;
> +               inst->mlen = 1;
> +               inst->base_mrf = -1;
> +               inst->regs_written = 4;
> +
> +               /* dst.xy = tmp.wz */
> +               bld.MOV(dst,                 offset(tmp, bld, 3));
> +               bld.MOV(offset(dst, bld, 1), offset(tmp, bld, 2));
> +               break;
> +            }
> +            case GL_TRIANGLES:
> +               /* DWord 4; hardcode offset = 1 and regs_written = 1 */
> +               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, 
> patch_handle);
> +               inst->offset = 1;
> +               inst->mlen = 1;
> +               inst->base_mrf = -1;
> +               inst->regs_written = 1;
> +               break;
> +            case GL_ISOLINES:
> +               /* All channels are undefined. */
> +               break;
> +            default:
> +               unreachable("Bogus tessellation domain");
> +            }
> +         } else if (imm_offset == 1) {
> +            /* This is a read of gl_TessLevelOuter[], which lives in the
> +             * Patch URB header.  The layout depends on the domain.
> +             */
> +            dst.type = BRW_REGISTER_TYPE_F;
> +
> +            fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
> +            inst->offset = 1;
> +            inst->mlen = 1;
> +            inst->base_mrf = -1;
> +            inst->regs_written = 4;
> +
> +            /* Reswizzle: WZYX */
> +            fs_reg srcs[4] = {
> +               offset(tmp, bld, 3),
> +               offset(tmp, bld, 2),
> +               offset(tmp, bld, 1),
> +               offset(tmp, bld, 0),
> +            };
> +
> +            unsigned num_components;
> +            switch (tcs_key->tes_primitive_mode) {
> +            case GL_QUADS:
> +               num_components = 4;
> +               break;
> +            case GL_TRIANGLES:
> +               num_components = 3;
> +               break;
> +            case GL_ISOLINES:
> +               /* Isolines are not reversed; swizzle .zw -> .xy */
> +               srcs[0] = offset(tmp, bld, 2);
> +               srcs[1] = offset(tmp, bld, 3);
> +               num_components = 2;
> +               break;
> +            default:
> +               unreachable("Bogus tessellation domain");
> +            }
> +            bld.LOAD_PAYLOAD(dst, srcs, num_components, 0);
> +         } else {
> +            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
> +            inst->offset = imm_offset;
> +            inst->mlen = 1;
> +            inst->base_mrf = -1;
> +            inst->regs_written = instr->num_components;
> +         }
> +      } else {
> +         /* Indirect indexing - use per-slot offsets as well. */
> +         const fs_reg srcs[] = {
> +            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
> +            indirect_offset
> +         };
> +         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
> +         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
> +
> +         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 
> payload);
> +         inst->offset = imm_offset;
> +         inst->mlen = 2;
> +         inst->base_mrf = -1;
> +         inst->regs_written = instr->num_components;
> +      }
> +      break;
> +   }
> +
> +   case nir_intrinsic_store_output:
> +   case nir_intrinsic_store_per_vertex_output: {
> +      fs_reg value = get_nir_src(instr->src[0]);
> +      fs_reg indirect_offset = get_indirect_offset(instr);
> +      unsigned imm_offset = instr->const_index[0];
> +      unsigned swiz = BRW_SWIZZLE_XYZW;
> +      unsigned mask = instr->const_index[1];
> +      unsigned header_regs = 0;
> +      fs_reg srcs[7];
> +      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
> +
> +      if (indirect_offset.file != BAD_FILE) {
> +         srcs[header_regs++] = indirect_offset;
> +      } else if (tcs_key->program_string_id != 0) {
> +         if (imm_offset == 0) {
> +            value.type = BRW_REGISTER_TYPE_F;
> +
> +            mask &= (1 << 
> tesslevel_inner_components(tcs_key->tes_primitive_mode)) - 1;
> +
> +            /* This is a write to gl_TessLevelInner[], which lives in the
> +             * Patch URB header.  The layout depends on the domain.
> +             */
> +            switch (tcs_key->tes_primitive_mode) {
> +            case GL_QUADS:
> +               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
> +                * We use an XXYX swizzle to place .xy, reversed, in the .wz
> +                * channels, and use a .zw writemask.
> +                */
> +               mask = writemask_for_backwards_vector(mask);
> +               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
> +               break;
> +            case GL_TRIANGLES:
> +               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
> +                * writemask to X and bump the URB offset by 1.
> +                */
> +               imm_offset = 1;
> +               break;
> +            case GL_ISOLINES:
> +               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
> +               return;
> +            default:
> +               unreachable("Bogus tessellation domain");
> +            }
> +         } else if (imm_offset == 1) {
> +            /* This is a write to gl_TessLevelOuter[] which lives in the
> +             * Patch URB Header at DWords 4-7.  However, it's reversed, so
> +             * instead of .xyzw we have .wzyx.
> +             */
> +            value.type = BRW_REGISTER_TYPE_F;
> +
> +            mask &= (1 << 
> tesslevel_outer_components(tcs_key->tes_primitive_mode)) - 1;
> +
> +            if (tcs_key->tes_primitive_mode == GL_ISOLINES) {
> +               /* Isolines .xy should be stored in .zw, in order. */
> +               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
> +               mask <<= 2;
> +            } else {
> +               /* Other domains are reversed; store .wzyx instead of .xyzw */
> +               swiz = BRW_SWIZZLE_WZYX;
> +               mask = writemask_for_backwards_vector(mask);
> +            }
> +         }
> +      }
> +
> +      if (mask == 0)
> +         break;
> +
> +      unsigned num_components = _mesa_fls(mask);
> +      enum opcode opcode;
> +
> +      if (mask != WRITEMASK_XYZW) {
> +         srcs[header_regs++] = brw_imm_ud(mask << 16);
> +         opcode = indirect_offset.file != BAD_FILE ?
> +            SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
> +            SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
> +      } else {
> +         opcode = indirect_offset.file != BAD_FILE ?
> +            SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
> +            SHADER_OPCODE_URB_WRITE_SIMD8;
> +      }
> +
> +      for (unsigned i = 0; i < num_components; i++) {
> +         if (mask & (1 << i))
> +            srcs[header_regs + i] = offset(value, bld, BRW_GET_SWZ(swiz, i));
> +      }
> +
> +      unsigned mlen = header_regs + num_components;
> +
> +      fs_reg payload =
> +         bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
> +      bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
> +
> +      fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
> +      inst->offset = imm_offset;
> +      inst->mlen = mlen;
> +      inst->base_mrf = -1;
> +      break;
> +   }
> +
> +   default:
> +      nir_emit_intrinsic(bld, instr);
> +      break;
> +   }
> +}
> +
> +void
>  fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
>                                     nir_intrinsic_instr *instr)
>  {
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index daabf70..41a9b12 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -1014,6 +1014,9 @@ fs_visitor::init()
>     case MESA_SHADER_VERTEX:
>        key_tex = &((const brw_vs_prog_key *) key)->tex;
>        break;
> +   case MESA_SHADER_TESS_CTRL:
> +      key_tex = &((const brw_tcs_prog_key *) key)->tex;
> +      break;
>     case MESA_SHADER_TESS_EVAL:
>        key_tex = &((const brw_tes_prog_key *) key)->tex;
>        break;
> diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c 
> b/src/mesa/drivers/dri/i965/brw_tcs.c
> index 0117ffe..98ed2b2 100644
> --- a/src/mesa/drivers/dri/i965/brw_tcs.c
> +++ b/src/mesa/drivers/dri/i965/brw_tcs.c
> @@ -214,7 +214,8 @@ brw_codegen_tcs_prog(struct brw_context *brw,
>        prog_data.base.base.nr_image_params = tcs->NumImages;
>  
>        brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base,
> -                                  &prog_data.base.base, false);
> +                                  &prog_data.base.base,
> +                                  
> compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);
>     } else {
>        /* Upload the Patch URB Header as the first two uniforms.
>         * Do the annoying scrambling so the shader doesn't have to.
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
> index 17e3448..79cf93e 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
> @@ -29,6 +29,7 @@
>  
>  #include "brw_nir.h"
>  #include "brw_vec4_tcs.h"
> +#include "brw_fs.h"
>  
>  namespace brw {
>  
> @@ -452,7 +453,10 @@ brw_compile_tcs(const struct brw_compiler *compiler,
>     brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
>     nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
>  
> -   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
> +   if (is_scalar)
> +      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 8);
> +   else
> +      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
>  
>     /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
>      * That divides up as follows:
> @@ -493,20 +497,49 @@ brw_compile_tcs(const struct brw_compiler *compiler,
>        brw_print_vue_map(stderr, &vue_prog_data->vue_map);
>     }
>  
> -   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
> -                      nir, mem_ctx, shader_time_index, &input_vue_map);
> -   if (!v.run()) {
> -      if (error_str)
> -         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
> -      return NULL;
> -   }
> +   if (is_scalar) {
> +      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
> +                   &prog_data->base.base, NULL, nir, 8,
> +                   shader_time_index, &input_vue_map);
> +      if (!v.run_tcs_single_patch()) {
> +         if (error_str)
> +            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
> +         return NULL;
> +      }
>  
> -   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
> -      v.dump_instructions();
> +      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
> +
> +      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
> +                     &prog_data->base.base, v.promoted_constants, false,
> +                     MESA_SHADER_TESS_CTRL);
> +      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
> +         g.enable_debug(ralloc_asprintf(mem_ctx,
> +                                        "%s tessellation control shader %s",
> +                                        nir->info.label ? nir->info.label
> +                                                        : "unnamed",
> +                                        nir->info.name));
> +      }
> +
> +      g.generate_code(v.cfg, 8);
>  
> -   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
> -                                     &prog_data->base, v.cfg,
> -                                     final_assembly_size);
> +      return g.get_assembly(final_assembly_size);
> +   } else {
> +      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
> +                         nir, mem_ctx, shader_time_index, &input_vue_map);
> +      if (!v.run()) {
> +         if (error_str)
> +            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
> +         return NULL;
> +      }
> +
> +      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
> +         v.dump_instructions();
> +
> +
> +      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
> +                                        &prog_data->base, v.cfg,
> +                                        final_assembly_size);
> +   }
>  }
>  
>  
> -- 
> 2.8.0
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 5/5] i965: Write a scalar TCS backend that runs in SINGLE_PATCH mode.

Reply via email to